├── .gitignore ├── .gitmodules ├── COPYING ├── Makefile ├── NOTICE ├── README.md ├── art ├── architecture.png ├── art.go ├── art.sh ├── results.png ├── webui.png ├── yesiscan.png └── yesiscan.svg ├── backend ├── askalono.go ├── askalono │ ├── .gitignore │ └── askalono.go ├── bitbake.go ├── cran.go ├── cran_test.go ├── cran_test_cases │ ├── test_case0.error │ ├── test_case0.input │ ├── test_case0.output │ ├── test_case1.error │ ├── test_case1.input │ ├── test_case1.output │ ├── test_case2.error │ ├── test_case2.input │ ├── test_case2.output │ ├── test_case3.error │ ├── test_case3.input │ ├── test_case3.output │ ├── test_case4.error │ ├── test_case4.input │ ├── test_case4.output │ ├── test_case5.error │ ├── test_case5.input │ └── test_case5.output ├── licenseclassifier.go ├── pom.go ├── regexp.go ├── regexpcore.go ├── scancode.go └── spdx.go ├── cmd └── yesiscan │ ├── .gitignore │ ├── .goreleaser.yaml │ ├── Makefile │ ├── main.go │ └── web.go ├── examples ├── DESCRIPTION ├── big5.json ├── config.json ├── pom.xml ├── regexp.json └── ssh.config ├── go.mod ├── go.sum ├── interfaces └── interfaces.go ├── iterator ├── bzip2.go ├── fs.go ├── git.go ├── gzip.go ├── http.go ├── iterator.go ├── tar.go ├── util.go ├── util_test.go └── zip.go ├── lib ├── lib.go ├── main.go ├── profiles.go └── results.go ├── parser └── parser.go ├── s3 ├── s3.go └── screenshot-s3-public-bucket.png ├── util ├── ansi │ └── ansi.go ├── errwrap │ └── errwrap.go ├── licenses │ ├── licenses.go │ └── licenses_test.go ├── safepath │ └── safepath.go └── util.go └── web ├── static ├── 4a90d9.jpg ├── icons8-checkmark.svg └── icons8-search.svg └── web.go /.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | tmp/ 3 | .autoConfigURI 4 | .autoConfigCookiePath 5 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule 
"util/licenses/license-list-data"] 2 | path = util/licenses/license-list-data 3 | url = https://github.com/spdx/license-list-data/ 4 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 
35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all build gofmt 2 | 3 | all: build 4 | 5 | build: 6 | $(MAKE) --quiet -C cmd/yesiscan 7 | 8 | gofmt: 9 | # TODO: remove gofmt once goimports has a -s option 10 | find . -maxdepth 9 -type f -name '*.go' -not -path './old/*' -not -path './tmp/*' -not -path './vendor/*' -exec gofmt -s -w {} \; 11 | find . -maxdepth 9 -type f -name '*.go' -not -path './old/*' -not -path './tmp/*' -not -path './vendor/*' -exec goimports -w {} \; 12 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com Inc or its affiliates and the project contributors 2 | Written by James Shubin and the project contributors 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use 5 | this file except in compliance with the License. 
You may obtain a copy of the 6 | License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software distributed 11 | under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | specific language governing permissions and limitations under the License. 14 | 15 | We will never require a CLA to submit a patch. All contributions follow the 16 | `inbound == outbound` rule. 17 | 18 | This is not an official Amazon product. Amazon does not offer support for this 19 | project. 20 | -------------------------------------------------------------------------------- /art/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/purpleidea/yesiscan/a9f8980f4f152aa15610bec090c7c6503db8ee6f/art/architecture.png -------------------------------------------------------------------------------- /art/art.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. 
All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | package art 25 | 26 | import ( 27 | _ "embed" 28 | ) 29 | 30 | //go:embed yesiscan.svg 31 | var YesiscanSvg []byte 32 | -------------------------------------------------------------------------------- /art/art.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | inkscape --without-gui --export-png "yesiscan.png" --export-width 400 "yesiscan.svg" 4 | -------------------------------------------------------------------------------- /art/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/purpleidea/yesiscan/a9f8980f4f152aa15610bec090c7c6503db8ee6f/art/results.png -------------------------------------------------------------------------------- /art/webui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/purpleidea/yesiscan/a9f8980f4f152aa15610bec090c7c6503db8ee6f/art/webui.png -------------------------------------------------------------------------------- /art/yesiscan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/purpleidea/yesiscan/a9f8980f4f152aa15610bec090c7c6503db8ee6f/art/yesiscan.png -------------------------------------------------------------------------------- /backend/askalono.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. 
You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | // TODO: should this be a subpackage? 25 | package backend 26 | 27 | import ( 28 | "bytes" 29 | "context" 30 | "encoding/json" 31 | "fmt" 32 | "os" 33 | "os/exec" 34 | "strings" 35 | "syscall" 36 | 37 | "github.com/awslabs/yesiscan/backend/askalono" 38 | "github.com/awslabs/yesiscan/interfaces" 39 | "github.com/awslabs/yesiscan/util/errwrap" 40 | "github.com/awslabs/yesiscan/util/licenses" 41 | "github.com/awslabs/yesiscan/util/safepath" 42 | ) 43 | 44 | const ( 45 | // AskalonoConfidenceError is the error string askalono returns for when 46 | // it doesn't have high enough confidence in a file. 47 | AskalonoConfidenceError = "Confidence threshold not high enough for any known license" 48 | ) 49 | 50 | // Askalono is based on the rust askalono project. It uses the Sørensen–Dice 51 | // coefficient for license comparison. 
It would be pretty easy, and preferable 52 | // to use one of the many pre-existing golang Sørensen–Dice implementations and 53 | // to have a pure golang solution for this, however it would be good to have at 54 | // least one backend that exec's out to a remote process, and since this one is 55 | // fairly self-contained, it is a good example to use before we try and wrap 56 | // something more complicated like scancode. 57 | // See: https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient 58 | type Askalono struct { 59 | Debug bool 60 | Logf func(format string, v ...interface{}) 61 | Prefix safepath.AbsDir 62 | 63 | // binary is the path of the executable to run. 64 | binary string 65 | } 66 | 67 | func (obj *Askalono) String() string { 68 | return "askalono" 69 | } 70 | 71 | func (obj *Askalono) Setup(ctx context.Context) error { 72 | // This runs --help to check this is in the path and running properly. 73 | // It also unpacks the embedded askalono binary if we have one to use! 74 | 75 | name, err := askalono.GetExpectedName() // what the binary expected to be named 76 | if err != nil { 77 | return err 78 | } 79 | obj.binary = name.Path() // the default 80 | 81 | relDir := safepath.UnsafeParseIntoRelDir("askalono/") 82 | prefix := safepath.JoinToAbsDir(obj.Prefix, relDir) 83 | if err := os.MkdirAll(prefix.Path(), interfaces.Umask); err != nil { 84 | return err 85 | } 86 | 87 | if size, absFile, err := askalono.InstallBinary(prefix); err != nil { 88 | // not a permanent error, we can fall back to anything built-in 89 | obj.Logf("unpacking binary failed: %v", err) 90 | } else { 91 | obj.binary = absFile.Path() // use this specific path instead! 
92 | // TODO: change to human readable bytes 93 | obj.Logf("installed: %d bytes to disk at %s", size, obj.binary) 94 | } 95 | 96 | args := []string{"--help"} 97 | 98 | prog := fmt.Sprintf("%s %s", obj.binary, strings.Join(args, " ")) 99 | 100 | obj.Logf("running: %s", prog) 101 | 102 | // TODO: do we need to do the ^C handling? 103 | // XXX: is the ^C context cancellation propagating into this correctly? 104 | cmd := exec.CommandContext(ctx, obj.binary, args...) 105 | cmd.Dir = "" 106 | cmd.Env = []string{} 107 | cmd.SysProcAttr = &syscall.SysProcAttr{ 108 | Setpgid: true, 109 | Pgid: 0, 110 | } 111 | 112 | if err := cmd.Run(); err != nil { 113 | if e, ok := err.(*exec.Error); ok && e.Err == exec.ErrNotFound { 114 | // TODO: this error message is CLI specific, but should be generalized 115 | obj.Logf("either run with --no-backend-askalono or install askalono into your $PATH") 116 | } 117 | 118 | obj.Logf("your %s doesn't seem to be working properly, check how it was installed?", obj.binary) 119 | return errwrap.Wrapf(err, "error running: %s", prog) 120 | } 121 | 122 | return nil 123 | } 124 | 125 | func (obj *Askalono) ScanPath(ctx context.Context, path safepath.Path, info *interfaces.Info) (*interfaces.Result, error) { 126 | 127 | if info.FileInfo.IsDir() { // path.IsDir() should be the same. 128 | return nil, nil // skip 129 | } 130 | if info.FileInfo.Size() == 0 { 131 | return nil, nil // skip 132 | } 133 | 134 | filename := path.Path() 135 | 136 | ctx, cancel := context.WithCancel(ctx) 137 | defer cancel() 138 | 139 | // yes the args need to go in this order, nothing else works... 140 | args := []string{"--format", "json", "identify", "--optimize", filename} 141 | 142 | prog := fmt.Sprintf("%s %s", obj.binary, strings.Join(args, " ")) 143 | 144 | // TODO: add a progress bar of some sort somewhere 145 | if obj.Debug { 146 | obj.Logf("running: %s", prog) 147 | } 148 | 149 | // TODO: do we need to do the ^C handling? 
150 | // XXX: is the ^C context cancellation propagating into this correctly? 151 | cmd := exec.CommandContext(ctx, obj.binary, args...) 152 | 153 | cmd.Dir = "" 154 | cmd.Env = []string{} 155 | 156 | // ignore signals sent to parent process (we're in our own group) 157 | cmd.SysProcAttr = &syscall.SysProcAttr{ 158 | Setpgid: true, 159 | Pgid: 0, 160 | } 161 | 162 | out, reterr := cmd.Output() 163 | if reterr != nil { 164 | if obj.Debug { 165 | obj.Logf("error running: %s", prog) 166 | } 167 | // XXX: bug: https://github.com/jpeddicord/askalono/issues/74 168 | // don't error here because it might be askalono erroring but 169 | // still returning output as an error message... it should not 170 | // have been written this way, but askalono team probably won't 171 | // change things now. 172 | //return nil, errwrap.Wrapf(reterr, "error running: %s", prog) 173 | } 174 | 175 | buffer := bytes.NewBuffer(out) 176 | if buffer.Len() == 0 { 177 | // XXX: bug: https://github.com/jpeddicord/askalono/issues/74 178 | obj.Logf("askalono EOF bug, skipped: %s", filename) 179 | return nil, nil // skip, unfortunately 180 | } 181 | decoder := json.NewDecoder(buffer) 182 | 183 | var askalonoOutput AskalonoOutput // this gets populated during decode 184 | if err := decoder.Decode(&askalonoOutput); err != nil { 185 | // programming error, report this to us please 186 | return nil, errwrap.Wrapf(err, "error decoding askalono json output") 187 | } 188 | 189 | if askalonoOutput.Path != "" && askalonoOutput.Path != filename { 190 | // programming error (probably in askalono) 191 | if obj.Debug { 192 | obj.Logf("expected: %s", filename) 193 | obj.Logf("got path: %s", askalonoOutput.Path) 194 | } 195 | return nil, fmt.Errorf("path did not match what was expected") 196 | } 197 | 198 | if reterr != nil && askalonoOutput.Error == "" { 199 | // probably a bug in askalono 200 | return nil, errwrap.Wrapf(reterr, "askalono bug, error running: %s", prog) 201 | } 202 | 203 | if reterr != nil && 
askalonoOutput.Error == AskalonoConfidenceError { 204 | return nil, nil // skip 205 | } 206 | 207 | if e := askalonoOutput.Error; reterr != nil && e != "" { 208 | return nil, fmt.Errorf("unhandled askalono error: %s", e) 209 | } 210 | 211 | if askalonoOutput.Result == nil { 212 | return nil, nil // didn't find anything 213 | } 214 | 215 | return askalonoResultHelper(askalonoOutput.Result) 216 | } 217 | 218 | // AskalonoOutput is modelled after the askalono output format. 219 | // 220 | // example: 221 | // 222 | // { 223 | // "path": "/home/ANT.AMAZON.COM/purple/code/license-finder-repo/spdx.go", 224 | // "result": { 225 | // "score": 0.9310345, 226 | // "license": { 227 | // "name": "MIT", 228 | // "kind":"original", 229 | // "aliases": [] 230 | // }, 231 | // "containing": [ 232 | // { 233 | // "score":0.993865, 234 | // "license": { 235 | // "name":"MIT", 236 | // "kind":"original", 237 | // "aliases": [] 238 | // }, 239 | // "line_range":[17,26] 240 | // } 241 | // ] 242 | // } 243 | // } 244 | type AskalonoOutput struct { 245 | // Path is an absolute file path to the file being scanned. 246 | Path string `json:"path"` 247 | 248 | // Result specifies what it found. 249 | Result *AskalonoResultContaining `json:"result"` 250 | 251 | // Error is a string returned instead of Result on askalono error. 252 | Error string 253 | } 254 | 255 | // AskalonoResult is the generic result format returned by askalono. It is 256 | // usually augmented by an additional field. That can be found in 257 | // AskalonoResultRanged or AskalonoResultContaining. 258 | type AskalonoResult struct { 259 | // Score is the matching score found. A 1.00 is a perfect match. 260 | Score float64 `json:"score"` 261 | 262 | // License points to the license information attached with this find. 263 | License *AskalonoLicense `json:"license"` 264 | } 265 | 266 | // AskalonoResultRanged is a version of the AskalonoResult that also contains 267 | // the line range information. 
268 | type AskalonoResultRanged struct { 269 | *AskalonoResult 270 | 271 | // LineRangeRaw specifies where the match was found. 272 | LineRangeRaw []int64 `json:"line_range"` 273 | 274 | // TODO: add LineRangeStart and LineRangeEnd and Unmarshall into there! 275 | } 276 | 277 | // AskalonoResultContaining is a version of the AskalonoResult that also 278 | // contains a list of additional AskalonoResultRanged matches. 279 | type AskalonoResultContaining struct { 280 | *AskalonoResult 281 | 282 | // Containing has some further information about the output. It isn't 283 | // always populated, and I think it is only used when --optimize is used 284 | // *and* it didn't find an exact match. It lists all the other matches 285 | // it found. 286 | Containing []*AskalonoResultRanged `json:"containing"` 287 | } 288 | 289 | // AskalonoLicense is the format of the license struct returned by askalono. 290 | type AskalonoLicense struct { 291 | // Name is the SPDX name of the license found. 292 | Name string `json:"name"` 293 | 294 | // Kind is some sort of license tag. So far I've found "original". 295 | Kind string `json:"kind"` 296 | 297 | // Aliases is probably aliases for this license. I've not found this 298 | // output anywhere atm, so I've left it as an interface. 
299 | Aliases []interface{} `json:"aliases"` 300 | } 301 | 302 | func askalonoResultHelper(result *AskalonoResultContaining) (*interfaces.Result, error) { 303 | if result == nil { 304 | return nil, fmt.Errorf("got nil result") 305 | } 306 | 307 | if result.AskalonoResult != nil && result.AskalonoResult.License != nil { 308 | return askalonoLicenseHelper(result.AskalonoResult.License, result.Score) 309 | } 310 | 311 | if len(result.Containing) == 0 { 312 | // programming error (probably in askalono) 313 | return nil, fmt.Errorf("got nil license") 314 | } 315 | 316 | // TODO: add file content ranges 317 | // XXX: askalono can't currently find more than one license at a time, 318 | // so we don't handle that more complicated case for now. More info: 319 | // https://github.com/jpeddicord/askalono/issues/40 320 | r := result.Containing[0].AskalonoResult 321 | return askalonoLicenseHelper(r.License, r.Score) 322 | } 323 | 324 | func askalonoLicenseHelper(input *AskalonoLicense, confidence float64) (*interfaces.Result, error) { 325 | if input == nil { 326 | return nil, fmt.Errorf("got nil license") 327 | } 328 | 329 | license := &licenses.License{ 330 | SPDX: input.Name, 331 | // TODO: populate other fields here (eg: found license text) 332 | } 333 | // FIXME: If license is not in SPDX, add a custom entry. 
334 | if err := license.Validate(); err != nil { 335 | //return nil, err 336 | license = &licenses.License{ 337 | //SPDX: "", 338 | Origin: "askalono.jpeddicord.github.com", 339 | Custom: input.Name, 340 | // TODO: populate other fields here (eg: found license text) 341 | } 342 | } 343 | return &interfaces.Result{ 344 | Licenses: []*licenses.License{ 345 | license, 346 | }, 347 | Confidence: confidence, 348 | }, nil 349 | } 350 | -------------------------------------------------------------------------------- /backend/askalono/.gitignore: -------------------------------------------------------------------------------- 1 | askalono-* 2 | -------------------------------------------------------------------------------- /backend/askalono/askalono.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 
21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | package askalono 25 | 26 | import ( 27 | "archive/zip" 28 | "bytes" 29 | "crypto/sha256" 30 | _ "embed" 31 | "fmt" 32 | "io" 33 | "os" 34 | "runtime" 35 | 36 | "github.com/awslabs/yesiscan/util/errwrap" 37 | "github.com/awslabs/yesiscan/util/safepath" 38 | ) 39 | 40 | const ( 41 | // AskalonoVersion is the version string used in the git tag. These can 42 | // be seen here: https://github.com/jpeddicord/askalono/releases/ 43 | AskalonoVersion = "0.4.6" 44 | ) 45 | 46 | // AskalonoHashes maps version number, os, and then sha256sum. These are the 47 | // hashes of the actual binaries, not the .zip files they are in. We ultimately 48 | // care just about the integrity of the binary, so that's all we need to check. 49 | // We don't also need to check the hash of the zip files, since we aren't 50 | // worried about opening a zip file being dangerous. 51 | // FIXME: We don't support different architectures for now. (eg: runtime.GOARCH) 52 | var AskalonoHashes = map[string]map[string]string{ 53 | "0.4.6": { 54 | "linux": "a089146694cf433a4580c3da414cf43c70722ba6398d214fe41ca27b53deb476", 55 | "darwin": "1e006e6c61ec4abd714ae930a94b2f447c57392d621a6e8367c7aaa4cb4f427c", 56 | "windows": "89f477e6e70e9bb58caf3b1f6a22fc6566e182ff81c3a920d49b6e6947ee97a1", 57 | }, 58 | } 59 | 60 | //go:embed askalono-0.4.6-Linux.zip 61 | var Askalono046Linux []byte 62 | 63 | //go:embed askalono-0.4.6-macOS.zip 64 | var Askalono046macOS []byte 65 | 66 | //go:embed askalono-0.4.6-Windows.zip 67 | var Askalono046Windows []byte 68 | 69 | func init() { 70 | if _, err := GetExpectedHash(); err != nil { 71 | panic(fmt.Sprintf("error with askalono hash lookup: %v", err)) 72 | } 73 | } 74 | 75 | // GetExpectedName returns the expected name of the binary for a given platform. 76 | // This happens to also be the path it is expected to be found in the zip file 77 | // because the packages contain that single file in the root. 
If this ever 78 | // changes, then we need to add an additional GetExpectedPath method and change 79 | // the logic. 80 | func GetExpectedName() (safepath.RelFile, error) { 81 | switch os := runtime.GOOS; os { 82 | case "linux": 83 | return safepath.ParseIntoRelFile("askalono") 84 | case "darwin": 85 | return safepath.ParseIntoRelFile("askalono") 86 | case "windows": 87 | return safepath.ParseIntoRelFile("askalono.exe") // lol, windows 88 | default: 89 | return safepath.RelFile{}, fmt.Errorf("unsupported os: %s", os) 90 | } 91 | } 92 | 93 | // GetExpectedHash returns the expected hash of the binary for this version and 94 | // OS. 95 | func GetExpectedHash() (string, error) { 96 | m, exists := AskalonoHashes[AskalonoVersion] 97 | if !exists { 98 | return "", fmt.Errorf("no askalono hash found for version: %s", AskalonoVersion) 99 | } 100 | h, exists := m[runtime.GOOS] 101 | if !exists { 102 | return "", fmt.Errorf("no askalono hash found for os: %s", runtime.GOOS) 103 | } 104 | if h == "" { 105 | return "", fmt.Errorf("empty hash") 106 | } 107 | // the null hash, you can get this by running: `sha256sum /dev/null` 108 | if h == "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" { 109 | return "", fmt.Errorf("null hash") 110 | } 111 | return h, nil 112 | } 113 | 114 | // GetZip returns the correct zipped package for this OS and ARCH. If it doesn't 115 | // have one available, then it errors. 116 | func GetZip() ([]byte, error) { 117 | if arch := runtime.GOARCH; arch != "amd64" { 118 | return nil, fmt.Errorf("unsupported arch: %s", arch) 119 | } 120 | 121 | var b []byte 122 | switch os := runtime.GOOS; os { 123 | case "linux": 124 | b = Askalono046Linux 125 | case "darwin": 126 | b = Askalono046macOS 127 | case "windows": 128 | b = Askalono046Windows 129 | default: 130 | return nil, fmt.Errorf("unsupported os: %s", os) 131 | } 132 | 133 | if len(b) == 0 { 134 | // Was it built/downloaded correctly? 
135 | return nil, fmt.Errorf("empty binary") 136 | } 137 | return b, nil 138 | } 139 | 140 | // InstallBinary installs an askalono binary into this dir if it's not there 141 | // already or if it has the wrong hash. It then returns its extracted size, and 142 | // its complete path. 143 | func InstallBinary(absDir safepath.AbsDir) (int64, safepath.AbsFile, error) { 144 | // NOTE: see this comment in the docs for this function. If the way the 145 | // zip files is built changes, we might need to change this for a 146 | // GetExpectedPath function call instead. 147 | relFileExpected, err := GetExpectedName() 148 | if err != nil { 149 | return 0, safepath.AbsFile{}, err 150 | } 151 | 152 | // this is where the output file will be stored 153 | absFile := safepath.JoinToAbsFile(absDir, relFileExpected) 154 | 155 | // First check the hash of the file at this location... If it's okay, 156 | // then we're done early! 157 | 158 | expectedHash, err := GetExpectedHash() 159 | if err != nil { 160 | // programming error, this was checked in init() 161 | return 0, safepath.AbsFile{}, err 162 | } 163 | 164 | if f, err := os.Open(absFile.Path()); err != nil && !os.IsNotExist(err) { 165 | // serious filesystem problem 166 | return 0, safepath.AbsFile{}, err 167 | } else if err == nil { 168 | // check the sha256 sum 169 | h := sha256.New() 170 | if _, err := io.Copy(h, f); err != nil { 171 | f.Close() // close it when we exit this block 172 | return 0, safepath.AbsFile{}, err 173 | } 174 | f.Close() // close it when we exit this block 175 | 176 | if fmt.Sprintf("%x", h.Sum(nil)) != expectedHash { 177 | // The expected binary destination file is invalid. So 178 | // delete it. We will re-write it later. This is safest. 
179 | if err := os.Remove(absFile.Path()); err != nil { 180 | return 0, safepath.AbsFile{}, errwrap.Wrapf(err, "error deleting invalid file: %s", absFile.Path()) 181 | } 182 | } 183 | } 184 | 185 | b, err := GetZip() 186 | if err != nil { 187 | return 0, safepath.AbsFile{}, err 188 | } 189 | 190 | // Open the zip archive for reading. 191 | // FIXME: use a variant that can take a context 192 | z, err := zip.NewReader(bytes.NewReader(b), int64(len(b))) 193 | if err != nil { 194 | return 0, safepath.AbsFile{}, err 195 | } 196 | //defer z.Close() // no close method exists 197 | if z.Comment != "" { 198 | //obj.Logf("zip has comment: %s", z.Comment) 199 | } 200 | 201 | // Iterate through the files in the archive. 202 | // XXX: can a child directory appear before a parent? 203 | // TODO: add a recurring progress logf if it takes longer than 30 sec 204 | var x *zip.File 205 | for _, x = range z.File { 206 | // TODO: obj.Debug ? 207 | //obj.Logf("zip: %s", x.Name) 208 | 209 | if x.FileInfo().IsDir() { 210 | continue 211 | } 212 | 213 | relFile, err := safepath.ParseIntoRelFile(x.Name) 214 | if err != nil { 215 | // programming error 216 | return 0, safepath.AbsFile{}, err 217 | } 218 | 219 | if relFileExpected.Cmp(relFile) == nil { 220 | break // found 221 | } 222 | } 223 | if x == nil { 224 | return 0, safepath.AbsFile{}, fmt.Errorf("did not file %s in zip archive", relFileExpected.Path()) 225 | } 226 | 227 | // NOTE: On the difference between absDir and absFile.Dir()... If they 228 | // differ, that's because the relfile has a parent relDir component. 229 | 230 | // XXX: which mode method? 231 | if err := os.MkdirAll(absFile.Dir().Path(), os.ModePerm); err != nil { 232 | return 0, safepath.AbsFile{}, err 233 | } 234 | 235 | // open the actual source file 236 | // we need to read this into a buffer, because this is a ReadCloser, not 237 | // a ReadSeekCloser. We want to make sure it passes the hash, before we 238 | // write it out to disk. 
239 | f, err := x.Open() 240 | if err != nil { 241 | return 0, safepath.AbsFile{}, errwrap.Wrapf(err, "error opening file %s", x.Name) 242 | } 243 | // don't `defer` close here because we want to free in the loop 244 | 245 | data, err := io.ReadAll(f) 246 | if err != nil { 247 | f.Close() // close file on error! 248 | return 0, safepath.AbsFile{}, err 249 | } 250 | f.Close() 251 | 252 | sum := sha256.Sum256(data) 253 | if h := fmt.Sprintf("%x", sum); h != expectedHash { 254 | return 0, safepath.AbsFile{}, fmt.Errorf("unexpected askalono binary hash of: %s", h) 255 | } 256 | 257 | // At this point, we can write out the file... 258 | // XXX: which mode method? 259 | if err := os.WriteFile(absFile.Path(), data, os.ModePerm); err != nil { 260 | return 0, safepath.AbsFile{}, errwrap.Wrapf(err, "error writing our file to disk at %s", absFile.Path()) 261 | } 262 | 263 | return int64(len(data)), absFile, nil // this is where the new binary was copied to 264 | } 265 | -------------------------------------------------------------------------------- /backend/bitbake.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. 
All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | // TODO: should this be a subpackage? 25 | package backend 26 | 27 | import ( 28 | "bufio" 29 | "bytes" 30 | "context" 31 | "sort" 32 | "strings" 33 | 34 | "github.com/awslabs/yesiscan/interfaces" 35 | "github.com/awslabs/yesiscan/util/errwrap" 36 | "github.com/awslabs/yesiscan/util/licenses" 37 | ) 38 | 39 | const ( 40 | // BitbakeMaxBytesLine sets a larger maximum for file line scanning than 41 | // the default of bufio.MaxScanTokenSize which is sort of small. 42 | BitbakeMaxBytesLine = 1024 * 1024 * 8 // 8 MiB 43 | 44 | // BitbakeLicensePrefix is the string we look for when trying to find a 45 | // license. 46 | BitbakeLicensePrefix = `LICENSE = "` 47 | 48 | // BitbakeLicenseSuffix is the terminating string at the end of the 49 | // line. We must not include the newline here. 50 | BitbakeLicenseSuffix = `"` 51 | 52 | // BitbakeFilenameSuffix is the file extension used by the bitbake 53 | // files. 54 | BitbakeFilenameSuffix = ".bb" 55 | ) 56 | 57 | // Bitbake is a license backend for the bitbake .bb files which are very 58 | // commonly seen in the yocto project. We use a trivial string parser for 59 | // finding these-- this could be improved significantly if people write fancier 60 | // .bb files, but this should get us 99% of the way there. 
61 | type Bitbake struct { 62 | Debug bool 63 | Logf func(format string, v ...interface{}) 64 | } 65 | 66 | func (obj *Bitbake) String() string { 67 | return "bitbake" 68 | } 69 | 70 | func (obj *Bitbake) ScanData(ctx context.Context, data []byte, info *interfaces.Info) (*interfaces.Result, error) { 71 | if !strings.HasSuffix(info.FileInfo.Name(), BitbakeFilenameSuffix) { 72 | return nil, nil // skip 73 | } 74 | 75 | if info.FileInfo.IsDir() { 76 | return nil, nil // skip 77 | } 78 | if len(data) == 0 { 79 | return nil, nil // skip 80 | } 81 | 82 | ctx, cancel := context.WithCancel(ctx) 83 | defer cancel() 84 | 85 | licenseMap := make(map[string]struct{}) 86 | 87 | reader := bytes.NewReader(data) 88 | scanner := bufio.NewScanner(reader) 89 | buf := []byte{} // create a buffer for very long lines 90 | scanner.Buffer(buf, BitbakeMaxBytesLine) // set the max size of that buffer 91 | for scanner.Scan() { 92 | // In an effort to short-circuit things if needed, we run a 93 | // check ourselves and break out early if we see that we have 94 | // cancelled early. 95 | select { 96 | case <-ctx.Done(): 97 | return nil, errwrap.Wrapf(ctx.Err(), "scanner ended early") 98 | default: 99 | } 100 | 101 | s := scanner.Text() // newlines will be stripped here 102 | if !strings.HasPrefix(s, BitbakeLicensePrefix) { 103 | continue 104 | } 105 | if !strings.HasSuffix(s, BitbakeLicenseSuffix) { 106 | continue 107 | } 108 | 109 | license := s[len(BitbakeLicensePrefix) : len(s)-len(BitbakeLicenseSuffix)] 110 | if license == "" { 111 | // TODO: should we warn here? 112 | continue 113 | } 114 | 115 | // XXX: i've only seen & in between license strings for now... 116 | // example: https://git.yoctoproject.org/poky/tree/meta/recipes-devtools/btrfs-tools/btrfs-tools_5.16.2.bb#n10 117 | lids := strings.Split(license, "&") // lid is licenseID 118 | for _, x := range lids { 119 | lid := strings.TrimSpace(x) 120 | // TODO: should we normalize case here? 
121 | licenseMap[lid] = struct{}{} 122 | } 123 | } 124 | var skip error 125 | scannerErr := scanner.Err() 126 | if scannerErr == bufio.ErrTooLong { 127 | skip = scannerErr // add to ignored files... 128 | scannerErr = nil // reset 129 | } 130 | 131 | ids := []string{} 132 | for id := range licenseMap { 133 | ids = append(ids, id) 134 | } 135 | sort.Strings(ids) // deterministic order 136 | 137 | licenseList := []*licenses.License{} 138 | 139 | for _, id := range ids { 140 | license := &licenses.License{ 141 | SPDX: id, 142 | // TODO: populate other fields here? 143 | } 144 | 145 | // If we find an unknown SPDX ID, we don't want to error, 146 | // because that would allow someone to put junk in their code to 147 | // prevent us scanning it. Instead, create an invalid license 148 | // but return it anyways. If we ever want to check validity, we 149 | // know to expect failures. 150 | if err := license.Validate(); err != nil { 151 | //return nil, err 152 | license = &licenses.License{ 153 | //SPDX: "", 154 | Origin: "", // unknown! 155 | Custom: id, 156 | // TODO: populate other fields here (eg: found license text) 157 | } 158 | } 159 | 160 | licenseList = append(licenseList, license) 161 | } 162 | 163 | if len(licenseMap) == 0 && skip == nil { 164 | // NOTE: this is NOT the same as interfaces.ErrUnknownLicense 165 | // because in this scenario, we're comfortable (ish) the parser 166 | // is exhaustive at finding a license with this methodology. 167 | // We want to return nil, but we error only if Scanner.Err() did 168 | // and so normally this returns nil, nil. 169 | return nil, errwrap.Wrapf(scannerErr, "bitbake scanner error") 170 | } 171 | 172 | result := &interfaces.Result{ 173 | Licenses: licenseList, 174 | Confidence: 1.0, // TODO: what should we put here? 
175 | Skip: skip, 176 | } 177 | 178 | // We perform the strange task of processing any partial results, and 179 | // returning some even if we errored, because the spdx code seems to 180 | // think this is better than no results. I'll do the same, but there is 181 | // no guarantee the calling iterator will use these. (Currently it does 182 | // not!) 183 | return result, errwrap.Wrapf(scannerErr, "bitbake scanner error") 184 | } 185 | -------------------------------------------------------------------------------- /backend/cran.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | // TODO: should this be a subpackage? 
25 | package backend 26 | 27 | import ( 28 | "bytes" 29 | "context" 30 | "errors" 31 | "net/mail" 32 | "regexp" 33 | "sort" 34 | "strings" 35 | 36 | "github.com/awslabs/yesiscan/interfaces" 37 | "github.com/awslabs/yesiscan/util/errwrap" 38 | "github.com/awslabs/yesiscan/util/licenses" 39 | ) 40 | 41 | const ( 42 | // CranLicensePrefix is the string we look for when trying to find a 43 | // license. 44 | CranLicensePrefix = "License" 45 | 46 | // CranFilename is the filename used by the R metadata files. 47 | CranFilename = "DESCRIPTION" 48 | ) 49 | 50 | var ( 51 | // ErrInvalidLicenseFormat is an error used in the 52 | // CranDescriptionFileSubParser when licenses with invalid format are 53 | // found. 54 | ErrInvalidLicenseFormat = errors.New("invalid format in License(s)") 55 | 56 | // stripTrashCran is used to replace all strings which include file and 57 | // a filename and sometimes have a + or | before it. For example: 58 | // "| file LICENSE". This also replaces newline characters. source: 59 | // https://cran.rstudio.com/doc/manuals/r-devel/R-exts.html#Licensing 60 | stripTrashCran = regexp.MustCompile(`(([+,|]?([\n ])*)file([\n ])+\w+\b([\n ])*)|\n`) 61 | ) 62 | 63 | // Cran is a backend for DESCRIPTION files which store R package metadata. We 64 | // are getting the license names from the License field in the text file. 65 | type Cran struct { 66 | Debug bool 67 | Logf func(format string, v ...interface{}) 68 | } 69 | 70 | // String method returns the name of the backend. 71 | func (obj *Cran) String() string { 72 | return "cran" 73 | } 74 | 75 | // ScanData is used to extract license ids from data and return licenses based 76 | // on the license ids. 77 | func (obj *Cran) ScanData(ctx context.Context, data []byte, info *interfaces.Info) (*interfaces.Result, error) { 78 | // This check is taking place with the assumption that the file that 79 | // will be scanned will be named "DESCRIPTION". 
80 | if info.FileInfo.Name() != CranFilename { 81 | return nil, nil // skip 82 | } 83 | if info.FileInfo.IsDir() { 84 | return nil, nil // skip 85 | } 86 | if len(data) == 0 { 87 | return nil, nil // skip 88 | } 89 | 90 | // Appending a newline to the data because the parser needs to have a 91 | // SECOND trailing newline for it to work properly. Who knows why... 92 | data = append(data, "\n"...) 93 | reader := bytes.NewReader(data) 94 | // Parse the DESCRIPTION file using RFC5322 which is also used for mail. 95 | parsed, err := mail.ReadMessage(reader) 96 | if err != nil { 97 | return nil, errwrap.Wrapf(err, "parse error") 98 | } 99 | 100 | // Getting license information from License field. 101 | cranlicenseFields, ok := parsed.Header[CranLicensePrefix] 102 | if !ok { 103 | // This would mean we did not have a License field in the 104 | // DESCRIPTION file. 105 | return nil, nil 106 | } 107 | licenseMap := make(map[string]struct{}) 108 | var subErr error 109 | for _, license := range cranlicenseFields { 110 | lids, err := CranDescriptionFileSubParser(license) // lid is licenseID 111 | if err != nil { 112 | subErr = errwrap.Append(subErr, err) // store for later 113 | } 114 | // Our parser might have partial results even when it errors. 115 | for _, lid := range lids { 116 | // TODO: should we normalize case here? 117 | licenseMap[lid] = struct{}{} 118 | } 119 | } 120 | 121 | ids := []string{} 122 | for id := range licenseMap { 123 | ids = append(ids, id) 124 | } 125 | sort.Strings(ids) // deterministic order 126 | 127 | licenseList := []*licenses.License{} 128 | 129 | for _, id := range ids { 130 | license := &licenses.License{ 131 | SPDX: id, 132 | // TODO: populate other fields here? 133 | } 134 | 135 | // If we find an unknown SPDX ID, we don't want to error, 136 | // because that would allow someone to put junk in their code to 137 | // prevent us scanning it. Instead, create an invalid license 138 | // but return it anyways. 
If we ever want to check validity, we 139 | // know to expect failures. 140 | // XXX: Some Cran licenses are not SPDX, therefore we might want 141 | // to add an alias matcher in the future. 142 | if err := license.Validate(); err != nil { 143 | //return nil, err 144 | license = &licenses.License{ 145 | //SPDX: "", 146 | Origin: "", // unknown! 147 | Custom: id, 148 | // TODO: populate other fields here 149 | // (eg: found license text) 150 | } 151 | } 152 | 153 | licenseList = append(licenseList, license) 154 | } 155 | 156 | // We return any partial results, and even if we errored, because we can 157 | // now notify the user of these issues separately. 158 | result := &interfaces.Result{ 159 | Licenses: licenseList, 160 | Confidence: 1.0, // TODO: what should we put here? 161 | Skip: errwrap.Wrapf(subErr, "cran sub-parser error"), 162 | } 163 | 164 | return result, nil 165 | } 166 | 167 | // CranDescriptionFileSubParser is used to parse the License field in 168 | // DESCRIPTION files. 169 | func CranDescriptionFileSubParser(input string) ([]string, error) { 170 | if input == "" { 171 | return nil, ErrInvalidLicenseFormat 172 | } 173 | // Removing all files and new line characters from input. 174 | input = stripTrashCran.ReplaceAllString(input, "") 175 | if input == "" { 176 | // We are returning nil, nil here because the input only 177 | // consisted of files for Licenses. 178 | return nil, nil 179 | } 180 | var result []string 181 | var err error 182 | // TODO: I have only seen | in between license strings for now. 
source: 183 | // https://cran.rstudio.com/doc/manuals/r-devel/R-exts.html#Licensing 184 | listLicenseNames := strings.Split(input, "|") 185 | for _, x := range listLicenseNames { 186 | license := strings.TrimSpace(x) 187 | if license == "" { 188 | err = ErrInvalidLicenseFormat 189 | continue 190 | } 191 | result = append(result, license) 192 | } 193 | return result, err 194 | } 195 | -------------------------------------------------------------------------------- /backend/cran_test.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | // TODO: should this be a subpackage? 
25 | 26 | package backend_test 27 | 28 | import ( 29 | "context" 30 | "fmt" 31 | "io/fs" 32 | "io/ioutil" 33 | "os" 34 | "path/filepath" 35 | "reflect" 36 | "strings" 37 | "testing" 38 | "time" 39 | 40 | "github.com/awslabs/yesiscan/backend" 41 | "github.com/awslabs/yesiscan/interfaces" 42 | "github.com/awslabs/yesiscan/iterator" 43 | "github.com/awslabs/yesiscan/util/licenses" 44 | ) 45 | 46 | // cranFileInfo struct helps make any input file to be DESCRIPTION files. 47 | type cranFileInfo struct { 48 | fileInfo fs.FileInfo 49 | } 50 | 51 | func (obj *cranFileInfo) Name() string { return backend.CranFilename } 52 | func (obj *cranFileInfo) Size() int64 { return obj.fileInfo.Size() } 53 | func (obj *cranFileInfo) Mode() fs.FileMode { return obj.fileInfo.Mode() } 54 | func (obj *cranFileInfo) ModTime() time.Time { return obj.fileInfo.ModTime() } 55 | func (obj *cranFileInfo) IsDir() bool { return obj.fileInfo.IsDir() } 56 | func (obj *cranFileInfo) Sys() interface{} { return obj.fileInfo.Sys() } 57 | 58 | // CranDescriptionFileSubParser parses the string in the License field to get 59 | // License names from DESCRIPTION files. If no Licenses are found nil is 60 | // returned and any files mentioned are ignored by the parser. 
61 | func TestCranDescriptionFileSubParser(t *testing.T) { 62 | errVal := backend.ErrInvalidLicenseFormat 63 | tests := []struct { 64 | input string 65 | output []string 66 | err error 67 | }{ 68 | {"", nil, errVal}, 69 | {"||||||", nil, errVal}, 70 | {"++--###", []string{"++--###"}, nil}, 71 | {"file LICENSE", nil, nil}, 72 | {"file any", nil, nil}, 73 | {"MIT + file LICENSE", []string{"MIT"}, nil}, 74 | {"MIT + file LICENSE | file LICENSE", []string{"MIT"}, nil}, 75 | {"Artistic-2.0 | AGPL-3 + file LICENSE", []string{"Artistic-2.0", "AGPL-3"}, nil}, 76 | {"GPL-2 | \n file LICENSE", []string{"GPL-2"}, nil}, 77 | {"MIT + file LICENSE | file LICENSE | AGPL-3 + file anything", []string{"MIT", "AGPL-3"}, nil}, 78 | {"Artistic-2.0 | AGPL-3 + file any | MIT + file LICENSE", []string{"Artistic-2.0", "AGPL-3", "MIT"}, nil}, 79 | {"Artistic-2.0 | | MIT +file LICENSE", []string{"Artistic-2.0", "MIT"}, errVal}, 80 | {"Artistic-2.0 | \n AGPL-3 + file any | \n MIT + file LICENSE", []string{"Artistic-2.0", "AGPL-3", "MIT"}, nil}, 81 | {"Artistic-2.0 | \n AGPL-3 \n + file any | \n MI\nT + file LICENSE", []string{"Artistic-2.0", "AGPL-3", "MIT"}, nil}, 82 | {"Artistic-2.0 | \n AGPL-3 \n + file any | -+-+##& | \n MIT + file LICENSE", []string{"Artistic-2.0", "AGPL-3", "-+-+##&", "MIT"}, nil}, 83 | } 84 | 85 | for i, test := range tests { 86 | out, err := backend.CranDescriptionFileSubParser(test.input) 87 | if err != test.err { 88 | t.Errorf("err: %v, exp err: %v", err, test.err) 89 | continue 90 | } 91 | if !reflect.DeepEqual(out, test.output) { 92 | t.Errorf("out: %v, exp out: %v", out, test.output) 93 | continue 94 | } 95 | t.Logf("test# %d succeeded!", i) 96 | } 97 | } 98 | 99 | // TestCranBackend tests whether the cran backend runs as intended. 
100 | func TestCranBackend(t *testing.T) { 101 | inputfilePaths, err := filepath.Glob("./cran_test_cases/*.input") 102 | if err != nil { 103 | t.Errorf("error getting input files: %v", err) 104 | return 105 | } 106 | cranBackend := &backend.Cran{ 107 | Debug: false, 108 | Logf: func(format string, v ...interface{}) { 109 | t.Logf("backend: "+format, v...) 110 | }, 111 | } 112 | for _, path := range inputfilePaths { 113 | inputFileInfo, err := os.Stat(path) 114 | if err != nil { 115 | t.Errorf("error getting FileInfo: %v", err) 116 | continue 117 | } 118 | data, err := ioutil.ReadFile(path) 119 | if err != nil { 120 | t.Errorf("error reading input file: %v", err) 121 | continue 122 | } 123 | fileInfo := &cranFileInfo{ 124 | fileInfo: inputFileInfo, 125 | } 126 | info := &interfaces.Info{ 127 | FileInfo: fileInfo, 128 | UID: iterator.FileScheme + path, 129 | } 130 | 131 | outputFilePath := strings.TrimSuffix(path, ".input") + ".output" 132 | errorFilePath := strings.TrimSuffix(path, ".input") + ".error" 133 | // TODO: if there is no error file, assume we expect no error 134 | outputContents, outputFileErr := ioutil.ReadFile(outputFilePath) 135 | if outputFileErr != nil { 136 | t.Errorf("error reading output file: %v", outputFileErr) 137 | } 138 | errorContents, errorFileErr := ioutil.ReadFile(errorFilePath) 139 | if errorFileErr != nil { 140 | t.Errorf("error reading error file: %v", errorFileErr) 141 | } 142 | if outputFileErr != nil || errorFileErr != nil { 143 | // give both statements a chance to tell us what's 144 | // missing before we go on to the next test case 145 | continue 146 | } 147 | 148 | expOut := strings.TrimSuffix(string(outputContents), "\n") 149 | var expErr error 150 | if s := strings.TrimSuffix(string(errorContents), "\n"); s != "" { 151 | expErr = fmt.Errorf(s) 152 | } 153 | 154 | result, err := cranBackend.ScanData(context.Background(), data, info) 155 | if (err == nil) != (expErr == nil) { // xor 156 | t.Errorf("filename: %v, err: %v", path, 
err) 157 | t.Errorf("filename: %v, exp: %v", path, expErr) 158 | continue 159 | } 160 | if err != nil && expErr != nil { 161 | if err.Error() != expErr.Error() { // compare the strings 162 | t.Errorf("filename: %v, err: %v", path, err) 163 | t.Errorf("filename: %v, exp: %v", path, expErr) 164 | continue 165 | } 166 | } 167 | 168 | var out string 169 | if result != nil { 170 | out = licenses.Join(result.Licenses) 171 | } 172 | if out != expOut { 173 | t.Errorf("filename: %v, out: %v", path, out) 174 | t.Errorf("filename: %v, exp: %v", path, expOut) 175 | continue 176 | } 177 | 178 | t.Logf("Success!") 179 | } 180 | } 181 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case0.error: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case0.input: -------------------------------------------------------------------------------- 1 | Title: Example DESCRIPTION Files 2 | Version: 1.4.1.9000 3 | Depends: 4 | R (>= 3.4) 5 | Imports: 6 | cli, 7 | utils 8 | Suggests: 9 | callr, 10 | covr 11 | Config/testthat/edition: 3 12 | Encoding: UTF-8 13 | Language: en-US 14 | Collate: 15 | 'assertions.R' 16 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case0.output: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case1.error: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case1.input: -------------------------------------------------------------------------------- 1 | Title: Example DESCRIPTION Files 2 | Version: 
1.4.1.9000 3 | License: MIT + 4 | file 5 | LICENSE | GPL-2.0 6 | License: Apache-2.0 7 | Depends: 8 | R (>= 3.4) 9 | Imports: 10 | cli, 11 | utils 12 | Suggests: 13 | callr, 14 | covr 15 | Config/testthat/edition: 3 16 | Encoding: UTF-8 17 | Language: en-US 18 | Collate: 19 | 'assertions.R' 20 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case1.output: -------------------------------------------------------------------------------- 1 | Apache-2.0, GPL-2.0, MIT 2 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case2.error: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case2.input: -------------------------------------------------------------------------------- 1 | Title: Example DESCRIPTION Files 2 | Version: 1.4.1.9000 3 | License: Apache-2 4 | Depends: 5 | R (>= 3.4) 6 | Imports: 7 | cli, 8 | utils 9 | Suggests: 10 | callr, 11 | covr 12 | Config/testthat/edition: 3 13 | Encoding: UTF-8 14 | Language: en-US 15 | Collate: 16 | 'assertions.R' 17 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case2.output: -------------------------------------------------------------------------------- 1 | Apache-2(unknown) 2 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case3.error: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case3.input: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/purpleidea/yesiscan/a9f8980f4f152aa15610bec090c7c6503db8ee6f/backend/cran_test_cases/test_case3.input -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case3.output: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case4.error: -------------------------------------------------------------------------------- 1 | cran sub-parser error: invalid format in License(s) 2 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case4.input: -------------------------------------------------------------------------------- 1 | Title: Example DESCRIPTION Files 2 | Version: 1.4.1.9000 3 | License: MIT + 4 | file 5 | LICENSE | GPL-2.0 6 | License: 7 | Depends: 8 | R (>= 3.4) 9 | Imports: 10 | cli, 11 | utils 12 | Suggests: 13 | callr, 14 | covr 15 | Config/testthat/edition: 3 16 | Encoding: UTF-8 17 | Language: en-US 18 | Collate: 19 | 'assertions.R' 20 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case4.output: -------------------------------------------------------------------------------- 1 | GPL-2.0, MIT 2 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case5.error: -------------------------------------------------------------------------------- 1 | cran sub-parser error: invalid format in License(s) 2 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case5.input: -------------------------------------------------------------------------------- 1 | Title: Example DESCRIPTION Files 2 | Version: 1.4.1.9000 3 | License: 4 | Depends: 5 | R (>= 3.4) 6 | Imports: 7 | cli, 8 | utils 9 | 
Suggests: 10 | callr, 11 | covr 12 | Config/testthat/edition: 3 13 | Encoding: UTF-8 14 | Language: en-US 15 | Collate: 16 | 'assertions.R' 17 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case5.output: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /backend/licenseclassifier.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | // TODO: should this be a subpackage? 
25 | package backend 26 | 27 | import ( 28 | "context" 29 | "fmt" 30 | "sort" 31 | 32 | "github.com/awslabs/yesiscan/interfaces" 33 | "github.com/awslabs/yesiscan/util/errwrap" 34 | "github.com/awslabs/yesiscan/util/licenses" 35 | "github.com/awslabs/yesiscan/util/safepath" 36 | 37 | "github.com/google/licenseclassifier" 38 | "github.com/google/licenseclassifier/tools/identify_license/backend" 39 | "github.com/google/licenseclassifier/tools/identify_license/results" 40 | ) 41 | 42 | // LicenseClassifier is based on the licenseclassifier project. 43 | type LicenseClassifier struct { 44 | // This was chosen as it's easier to have the first backend be based on 45 | // a native golang project, rather than having to play the exec games 46 | // right away. Some code within is based on their cli code that wraps 47 | // their lib. 48 | 49 | Debug bool 50 | Logf func(format string, v ...interface{}) 51 | 52 | // XXX: also match with .header files 53 | // XXX: what default value do we want here? 54 | // XXX: what exactly does this do? 55 | IncludeHeaders bool 56 | 57 | // UseDefaultConfidence specifies whether we should use the default 58 | // confidence threshold that this library seems to use all the time. 59 | // I've noticed that without it, it misidentifies a lot of things. But 60 | // with it, it misses some things entirely, even if it incorrectly 61 | // identifies them. 62 | UseDefaultConfidence bool 63 | 64 | // SkipZeroResults tells this backend to avoid erroring when we aren't 65 | // able to determine if a file matches a known license. Since this 66 | // particular backend is not good at general file identification, and 67 | // only good at being presented with actual licenses, this is useful if 68 | // file filtering is not enabled. 
69 | SkipZeroResults bool 70 | } 71 | 72 | func (obj *LicenseClassifier) String() string { 73 | return "licenseclassifier" 74 | } 75 | 76 | func (obj *LicenseClassifier) ScanPath(ctx context.Context, path safepath.Path, info *interfaces.Info) (*interfaces.Result, error) { 77 | 78 | if info.FileInfo.IsDir() { // path.IsDir() should be the same. 79 | return nil, nil // skip 80 | } 81 | if info.FileInfo.Size() == 0 { 82 | return nil, nil // skip 83 | } 84 | 85 | filenames := []string{path.Path()} 86 | 87 | threshold := 0.0 // we decide acceptability downstream 88 | if obj.UseDefaultConfidence { 89 | threshold = licenseclassifier.DefaultConfidenceThreshold 90 | } 91 | forbiddenOnly := true // identify using forbidden licenses archive 92 | be, err := backend.New(threshold, forbiddenOnly) 93 | if err != nil { 94 | be.Close() 95 | return nil, errwrap.Wrapf(err, "cannot create license classifier") 96 | } 97 | 98 | // XXX: bug: https://github.com/google/licenseclassifier/issues/28 99 | ctx, cancel := context.WithCancel(ctx) 100 | defer cancel() 101 | if errs := be.ClassifyLicensesWithContext(ctx, filenames, obj.IncludeHeaders); errs != nil { 102 | be.Close() 103 | for _, err := range errs { 104 | if obj.Debug { 105 | obj.Logf("classify license failed: %v", err) 106 | } 107 | } 108 | return nil, fmt.Errorf("cannot classify licenses") 109 | } 110 | 111 | results := be.GetResults() 112 | if len(results) == 0 { 113 | be.Close() 114 | return nil, nil 115 | } 116 | 117 | sort.Sort(results) 118 | // A match identifies the result of matching a string against a known value. 119 | // Name string // Name of known value that was matched. 120 | // Confidence float64 // Confidence percentage. 121 | // Offset int // The offset into the unknown string the match was made. 122 | // Extent int // The length from the offset into the unknown string. 
123 | //for _, r := range results { 124 | // log.Printf("%s: %s (confidence: %v, offset: %v, extent: %v)", 125 | // r.Filename, r.Name, r.Confidence, r.Offset, r.Extent) 126 | // // licenses/AGPL-3.0.txt: AGPL-3.0 (confidence: 0.9999677086024283, offset: 0, extent: 30968) 127 | //} 128 | be.Close() 129 | // This can give us multiple results, sorted by most confident. 130 | result, err := licenseclassifierResultHelper(results[0]) 131 | if err != nil { 132 | return nil, err 133 | } 134 | 135 | // Add more info about the others possibilities to the result. 136 | more := []*interfaces.Result{} 137 | for i := 1; i < len(results); i++ { 138 | r, err := licenseclassifierResultHelper(results[i]) 139 | if err != nil { 140 | return nil, err 141 | } 142 | more = append(more, r) 143 | } 144 | if len(more) > 0 { 145 | result.More = more 146 | } 147 | 148 | return result, nil 149 | } 150 | 151 | func licenseclassifierResultHelper(result *results.LicenseType) (*interfaces.Result, error) { 152 | if result == nil { 153 | return nil, fmt.Errorf("got nil result") 154 | } 155 | 156 | // XXX: This backend seems to return names that aren't valid SPDX ID's. 157 | // It's also not necessarily guaranteed that the SPDX ID's they do 158 | // return correspond to the exact same license texts that we expect. We 159 | // need to (1) ensure the mapping is the same, and (2) check when one of 160 | // these licenses is not in our SPDX list, and tag it separately. 161 | license := &licenses.License{ 162 | SPDX: result.Name, 163 | // TODO: populate other fields here (eg: found license text) 164 | } 165 | // FIXME: If license is not in SPDX, add a custom entry. 
166 | // FIXME: https://github.com/google/licenseclassifier/issues/31 167 | if err := license.Validate(); err != nil { 168 | //return nil, err 169 | license = &licenses.License{ 170 | //SPDX: "", 171 | Origin: "licenseclassifier.google.github.com", 172 | Custom: result.Name, 173 | // TODO: populate other fields here (eg: found license text) 174 | } 175 | } 176 | return &interfaces.Result{ 177 | Licenses: []*licenses.License{ 178 | license, 179 | }, 180 | Confidence: result.Confidence, 181 | }, nil 182 | } 183 | -------------------------------------------------------------------------------- /backend/pom.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | // TODO: should this be a subpackage? 
25 | package backend 26 | 27 | import ( 28 | "context" 29 | "encoding/xml" 30 | "sort" 31 | 32 | "github.com/awslabs/yesiscan/interfaces" 33 | "github.com/awslabs/yesiscan/util/errwrap" 34 | "github.com/awslabs/yesiscan/util/licenses" 35 | ) 36 | 37 | const ( 38 | // PomFilename is the file name used by the pomfiles. 39 | PomFilename = "pom.xml" 40 | ) 41 | 42 | // Pom is a backend for Pom or Project Object Model files. It is an xml file 43 | // commonly used by the Maven Project under the name pom.xml. We are getting the 44 | // license names by parsing the pom.xml file. 45 | type Pom struct { 46 | Debug bool 47 | Logf func(format string, v ...interface{}) 48 | } 49 | 50 | // String method returns the name of the backend. 51 | func (obj *Pom) String() string { 52 | return "pom" 53 | } 54 | 55 | // ScanData method is used to extract license ids from data and return licenses 56 | // based on the license ids. 57 | func (obj *Pom) ScanData(ctx context.Context, data []byte, info *interfaces.Info) (*interfaces.Result, error) { 58 | // This check is taking place with the assumption that the file that will be 59 | // scanned will have to be named "pom.xml". 60 | if info.FileInfo.Name() != PomFilename { 61 | return nil, nil // skip 62 | } 63 | if info.FileInfo.IsDir() { 64 | return nil, nil // skip 65 | } 66 | if len(data) == 0 { 67 | return nil, nil // skip 68 | } 69 | 70 | licenseMap := make(map[string]struct{}) 71 | var pomFileLicenses PomLicenses 72 | 73 | // parsing pom.xml file to get license names in struct 74 | if err := xml.Unmarshal(data, &pomFileLicenses); err != nil { 75 | // There is a parse error with the file, so we can't properly 76 | // examine it for licensing information with this pom scanner. 77 | result := &interfaces.Result{ 78 | Confidence: 1.0, // TODO: what should we put here? 
79 | Skip: errwrap.Wrapf(err, "parse error"), 80 | } 81 | return result, nil 82 | } 83 | 84 | if len(pomFileLicenses.Names) == 0 { 85 | // If we did not get any license names from the pom file we return nil, nil. 86 | return nil, nil 87 | } 88 | 89 | // lid is license id 90 | for _, lid := range pomFileLicenses.Names { 91 | licenseMap[lid] = struct{}{} 92 | } 93 | 94 | ids := []string{} 95 | for id := range licenseMap { 96 | ids = append(ids, id) 97 | } 98 | sort.Strings(ids) // deterministic order 99 | 100 | licenseList := []*licenses.License{} 101 | 102 | for _, id := range ids { 103 | license := &licenses.License{ 104 | SPDX: id, 105 | // TODO: populate other fields here? 106 | } 107 | 108 | // If we find an unknown SPDX ID, we don't want to error, because that would 109 | // allow someone to put junk in their code to prevent us scanning it. Instead, 110 | // create an invalid license but return it anyways. If we ever want to check 111 | // validity, we know to expect failures. 112 | // XXX: Many Pom licenses are not SPDX, therefore we might want to add an alias 113 | // matcher in the future. 114 | if err := license.Validate(); err != nil { 115 | //return nil, err 116 | license = &licenses.License{ 117 | //SPDX: "", 118 | Origin: "", // unknown! 119 | Custom: id, 120 | // TODO: populate other fields here (eg: found license text) 121 | } 122 | } 123 | 124 | licenseList = append(licenseList, license) 125 | } 126 | 127 | result := &interfaces.Result{ 128 | Licenses: licenseList, 129 | Confidence: 1.0, // TODO: what should we put here? 130 | } 131 | 132 | return result, nil 133 | } 134 | 135 | // PomLicenses is a struct that helps store license names from the licenses 136 | // field in a pom.xml file. 137 | type PomLicenses struct { 138 | // Names is a variable that will store the license names from pom.xml. 
139 | Names []string `xml:"licenses>license>name"` 140 | } 141 | -------------------------------------------------------------------------------- /backend/regexp.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | // TODO: should this be a subpackage? 25 | package backend 26 | 27 | import ( 28 | "bytes" 29 | "context" 30 | "encoding/json" 31 | "fmt" 32 | "os" 33 | 34 | "github.com/awslabs/yesiscan/interfaces" 35 | "github.com/awslabs/yesiscan/util/errwrap" 36 | ) 37 | 38 | // Regexp is a simple backend that uses regular expressions to find certain 39 | // license strings. It wraps the RegexpCore backend and adds the file input 40 | // code. 41 | type Regexp struct { 42 | *RegexpCore 43 | 44 | // Filename is an absolute path to a file that we will read the patterns 45 | // from. The struct is described below and an example is available in 46 | // the examples folder. 
47 | Filename string 48 | } 49 | 50 | func (obj *Regexp) String() string { 51 | return obj.RegexpCore.String() 52 | } 53 | 54 | func (obj *Regexp) Setup(ctx context.Context) error { 55 | b, err := os.ReadFile(obj.Filename) 56 | if err != nil { 57 | // TODO: this error message is CLI specific, but should be generalized 58 | obj.Logf("either run with --no-backend-regexp or create your regexp pattern file at %s", obj.Filename) 59 | return errwrap.Wrapf(err, "could not read config file: %s", obj.Filename) 60 | } 61 | 62 | buffer := bytes.NewBuffer(b) 63 | if buffer.Len() == 0 { 64 | // TODO: should this be an error, or just a silent ignore? 65 | return fmt.Errorf("empty input file") 66 | } 67 | decoder := json.NewDecoder(buffer) 68 | 69 | var regexpConfig RegexpConfig // this gets populated during decode 70 | if err := decoder.Decode(®expConfig); err != nil { 71 | return errwrap.Wrapf(err, "error decoding regexp json output") 72 | } 73 | 74 | obj.RegexpCore.Rules = regexpConfig.Rules 75 | obj.RegexpCore.Origin = regexpConfig.Origin 76 | 77 | return obj.RegexpCore.Setup(ctx) 78 | } 79 | 80 | func (obj *Regexp) ScanData(ctx context.Context, data []byte, info *interfaces.Info) (*interfaces.Result, error) { 81 | return obj.RegexpCore.ScanData(ctx, data, info) 82 | } 83 | 84 | // RegexpConfig is the structure of the pattern config file. 85 | type RegexpConfig struct { 86 | // Rules is the list of regexp and license id rules. 87 | Rules []*RegexpLicenseRule `json:"rules"` 88 | 89 | // Origin is the SPDX origin string if we want to have a custom 90 | // namespace for non-SPDX license ID's. 91 | Origin string `json:"origin"` 92 | 93 | // Comment adds a user friendly comment for this file. We could use it 94 | // to add a version string or maybe that could be a separate field. 
95 | Comment string `json:"comment"` 96 | } 97 | -------------------------------------------------------------------------------- /backend/regexpcore.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | // TODO: should this be a subpackage? 25 | package backend 26 | 27 | import ( 28 | "bufio" 29 | "bytes" 30 | "context" 31 | "regexp" 32 | "sort" 33 | "strings" 34 | 35 | "github.com/awslabs/yesiscan/interfaces" 36 | "github.com/awslabs/yesiscan/util/errwrap" 37 | "github.com/awslabs/yesiscan/util/licenses" 38 | ) 39 | 40 | const ( 41 | // RegexpMaxBytesLine sets a larger maximum for file line scanning than 42 | // the default of bufio.MaxScanTokenSize which is sort of small. 43 | RegexpMaxBytesLine = 1024 * 1024 * 8 // 8 MiB 44 | ) 45 | 46 | // RegexpCore is a simple backend that uses regular expressions to find certain 47 | // license strings. 
You should probably not use this backend directly, but wrap 48 | // it with one of the other ones like Regexp. 49 | type RegexpCore struct { 50 | Debug bool 51 | Logf func(format string, v ...interface{}) 52 | 53 | // Rules is a list of regexp license rules. 54 | Rules []*RegexpLicenseRule 55 | 56 | // Origin is the license field origin which is used if a non-SPDX ID is 57 | // specified. You can use this blank if you want. These are commonly 58 | // expressed in "reverse-dns" notation to provide a unique identifier 59 | // when naming your license. Eg: "yesiscan.awslabs.github.com". 60 | Origin string 61 | 62 | // MultipleMatch is set to true if you want the same regexp to be 63 | // allowed to match more than once in the same file. This is useful if 64 | // you want to be able to pull out every range where the pattern is 65 | // seen, even if you will keep getting the same license answer. Most of 66 | // the time you probably want to leave this as false. 67 | MultipleMatch bool 68 | 69 | // compiledRegexps is compiled list of the above Rules field. This is 70 | // done for performance reasons. 
71 | compiledRegexps []*regexp.Regexp 72 | } 73 | 74 | func (obj *RegexpCore) String() string { 75 | return "regexp" 76 | } 77 | 78 | func (obj *RegexpCore) Setup(ctx context.Context) error { 79 | for i, x := range obj.Rules { 80 | r, err := regexp.Compile(x.Pattern) 81 | if err != nil { 82 | return errwrap.Wrapf(err, "regexp compile failed at index: %d", i) 83 | } 84 | obj.compiledRegexps = append(obj.compiledRegexps, r) 85 | } 86 | 87 | return nil 88 | } 89 | 90 | func (obj *RegexpCore) ScanData(ctx context.Context, data []byte, info *interfaces.Info) (*interfaces.Result, error) { 91 | if info.FileInfo.IsDir() { 92 | return nil, nil // skip 93 | } 94 | if len(data) == 0 { 95 | return nil, nil // skip 96 | } 97 | 98 | ctx, cancel := context.WithCancel(ctx) 99 | defer cancel() 100 | 101 | licenseMap := make(map[string]struct{}) 102 | 103 | reader := bytes.NewReader(data) 104 | scanner := bufio.NewScanner(reader) 105 | buf := []byte{} // create a buffer for very long lines 106 | scanner.Buffer(buf, RegexpMaxBytesLine) // set the max size of that buffer 107 | for scanner.Scan() { 108 | // In an effort to short-circuit things if needed, we run a 109 | // check ourselves and break out early if we see that we have 110 | // cancelled early. 111 | select { 112 | case <-ctx.Done(): 113 | return nil, errwrap.Wrapf(ctx.Err(), "scanner ended early") 114 | default: 115 | } 116 | 117 | s := scanner.Text() // newlines will be stripped here 118 | s = strings.TrimSpace(s) 119 | if s == "" { 120 | continue 121 | } 122 | 123 | for i, r := range obj.compiledRegexps { 124 | loc := r.FindStringIndex(s) // (loc []int) 125 | if loc == nil { // no match 126 | continue 127 | } 128 | if obj.Debug { 129 | obj.Logf("matched: %s", string(s[loc[0]:loc[1]])) 130 | } 131 | 132 | lid := obj.Rules[i].ID 133 | // TODO: replace this with a generic license parser and 134 | // alias matcher. 
135 | split := strings.Split(lid, " AND ") 136 | for _, l := range split { 137 | l = strings.TrimSpace(l) 138 | licenseMap[l] = struct{}{} 139 | } 140 | if !obj.MultipleMatch { 141 | break // just break this inner loop 142 | } 143 | } 144 | } 145 | var skip error 146 | scannerErr := scanner.Err() 147 | if scannerErr == bufio.ErrTooLong { 148 | skip = scannerErr // add to ignored files... 149 | scannerErr = nil // reset 150 | } 151 | 152 | ids := []string{} 153 | for id := range licenseMap { 154 | ids = append(ids, id) 155 | } 156 | sort.Strings(ids) // deterministic order 157 | 158 | licenseList := []*licenses.License{} 159 | 160 | for _, id := range ids { 161 | license := &licenses.License{ 162 | SPDX: id, 163 | // TODO: populate other fields here? 164 | } 165 | 166 | // If we find an unknown SPDX ID, we don't want to error, 167 | // because that would allow someone to put junk in their code to 168 | // prevent us scanning it. Instead, create an invalid license 169 | // but return it anyways. If we ever want to check validity, we 170 | // know to expect failures. It *must* be valid because it's an 171 | // explicit SPDX scanner. 172 | if err := license.Validate(); err != nil { 173 | //return nil, err 174 | license = &licenses.License{ 175 | //SPDX: "", 176 | Origin: obj.Origin, 177 | Custom: id, 178 | // TODO: populate other fields here (eg: found license text) 179 | } 180 | } 181 | 182 | licenseList = append(licenseList, license) 183 | } 184 | 185 | if len(licenseMap) == 0 && skip == nil { 186 | // NOTE: this is NOT the same as interfaces.ErrUnknownLicense 187 | // because in this scenario, we're comfortable (ish) the parser 188 | // is exhaustive at finding a license with this methodology. 189 | // We want to return nil, but we error only if Scanner.Err() did 190 | // and so normally this returns nil, nil. 
191 | return nil, errwrap.Wrapf(scannerErr, "regexp scanner error") 192 | } 193 | 194 | result := &interfaces.Result{ 195 | Licenses: licenseList, 196 | Confidence: 1.0, // TODO: what should we put here? 197 | Skip: skip, 198 | } 199 | 200 | // We perform the strange task of processing any partial results, and 201 | // returning some even if we errored, because the spdx code seems to 202 | // think this is better than no results. I'll do the same, but there is 203 | // no guarantee the calling iterator will use these. (Currently it does 204 | // not!) 205 | return result, errwrap.Wrapf(scannerErr, "regexp scanner error") 206 | } 207 | 208 | // RegexpLicenseRule represents the data required for a regexp license rule. 209 | // Reminder, you can use backticks to quote golang strings, which is 210 | // particularly helpful when entering regular expressions into structs. 211 | type RegexpLicenseRule struct { 212 | // Pattern is the expression we want to match. This uses the stock 213 | // golang regexp engine. 214 | Pattern string `json:"pattern"` 215 | 216 | // ID is the license ID we should use when the above pattern matches. It 217 | // should be an SPDX ID, but other strings are supported, they just 218 | // won't be treated as SPDX if they aren't in our database of allowed 219 | // license identifiers. 220 | ID string `json:"id"` 221 | 222 | // TODO: add a comment field? 223 | //Comment string `json:"comment"` 224 | } 225 | -------------------------------------------------------------------------------- /backend/spdx.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. 
You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | // TODO: should this be a subpackage? 25 | package backend 26 | 27 | import ( 28 | "bufio" 29 | "bytes" 30 | "context" 31 | "regexp" 32 | "sort" 33 | "strings" 34 | 35 | "github.com/awslabs/yesiscan/interfaces" 36 | "github.com/awslabs/yesiscan/util/errwrap" 37 | "github.com/awslabs/yesiscan/util/licenses" 38 | ) 39 | 40 | const ( 41 | // SpdxMaxBytesLine sets a larger maximum for file line scanning than 42 | // the default of bufio.MaxScanTokenSize which is sort of small. 43 | SpdxMaxBytesLine = 1024 * 1024 * 8 // 8 MiB 44 | 45 | // magicStringSPDX is the string we look for when trying to find an ID. 46 | magicStringSPDX = "SPDX-License-Identifier:" 47 | 48 | // magicNumberSPDX is... a bad parser hack that SPDX recommends. 49 | magicNumberSPDX = 5 50 | ) 51 | 52 | var ( 53 | // stripTrashSPDX is taken from the spdx tools repository. 54 | stripTrashSPDX = regexp.MustCompile(`[^\w\s\d.\-\+()]+`) 55 | ) 56 | 57 | // Spdx is based on the Software Package Data Exchange project. It is built 58 | // with a slightly objectionable parser as prescribed in the official tools 59 | // repo. 
60 | type Spdx struct { 61 | Debug bool 62 | Logf func(format string, v ...interface{}) 63 | } 64 | 65 | func (obj *Spdx) String() string { 66 | return "spdx" 67 | } 68 | 69 | func (obj *Spdx) ScanData(ctx context.Context, data []byte, info *interfaces.Info) (*interfaces.Result, error) { 70 | if info.FileInfo.IsDir() { 71 | return nil, nil // skip 72 | } 73 | if len(data) == 0 { 74 | return nil, nil // skip 75 | } 76 | 77 | ctx, cancel := context.WithCancel(ctx) 78 | defer cancel() 79 | 80 | licenseMap := make(map[string]struct{}) 81 | 82 | // An official parser for SPDX ID's seems be: 83 | // https://github.com/spdx/tools-golang/blob/a16d50ee155238df280a68252acc25e9afb7acea/idsearcher/idsearcher.go#L269 84 | // If it's meant to be that simplistic, we'll implement something 85 | // similar. Please report bugs over there before you report them here =D 86 | 87 | reader := bytes.NewReader(data) 88 | scanner := bufio.NewScanner(reader) 89 | buf := []byte{} // create a buffer for very long lines 90 | scanner.Buffer(buf, SpdxMaxBytesLine) // set the max size of that buffer 91 | for scanner.Scan() { 92 | // In an effort to short-circuit things if needed, we run a 93 | // check ourselves and break out early if we see that we have 94 | // cancelled early. 95 | select { 96 | case <-ctx.Done(): 97 | return nil, errwrap.Wrapf(ctx.Err(), "scanner ended early") 98 | default: 99 | } 100 | 101 | s := scanner.Text() // newlines will be stripped here 102 | strs := strings.SplitN(s, magicStringSPDX, 2) // max split of 2 103 | if len(strs) == 1 { // no split happened, string not found 104 | continue 105 | } 106 | 107 | // weird way to parse, but whatever: 108 | // "if prefixed by more than n characters, it's probably not a 109 | // short-form ID; it's probably code to detect short-form IDs." 
110 | if len(stripTrash(strs[0])) > magicNumberSPDX { // arbitrary wat 111 | continue 112 | } 113 | 114 | // spdx says: "stop before trailing */ if it is present" 115 | lid := strings.Split(strs[1], "*/")[0] // lid is licenseID 116 | lid = strings.TrimSpace(lid) 117 | lid = stripTrash(lid) 118 | 119 | licenseMap[lid] = struct{}{} 120 | } 121 | var skip error 122 | scannerErr := scanner.Err() 123 | if scannerErr == bufio.ErrTooLong { 124 | skip = scannerErr // add to ignored files... 125 | scannerErr = nil // reset 126 | } 127 | 128 | ids := []string{} 129 | for id := range licenseMap { 130 | ids = append(ids, id) 131 | } 132 | sort.Strings(ids) // deterministic order 133 | 134 | licenseList := []*licenses.License{} 135 | 136 | for _, id := range ids { 137 | license := &licenses.License{ 138 | SPDX: id, 139 | // TODO: populate other fields here? 140 | } 141 | 142 | // If we find an unknown SPDX ID, we don't want to error, 143 | // because that would allow someone to put junk in their code to 144 | // prevent us scanning it. Instead, create an invalid license 145 | // but return it anyways. If we ever want to check validity, we 146 | // know to expect failures. It *must* be valid because it's an 147 | // explicit SPDX scanner. 148 | if err := license.Validate(); err != nil { 149 | //return nil, err 150 | license = &licenses.License{ 151 | //SPDX: "", 152 | Origin: "", // unknown! 153 | Custom: id, 154 | // TODO: populate other fields here (eg: found license text) 155 | } 156 | } 157 | 158 | licenseList = append(licenseList, license) 159 | } 160 | 161 | if len(licenseMap) == 0 && skip == nil { 162 | // NOTE: this is NOT the same as interfaces.ErrUnknownLicense 163 | // because in this scenario, we're comfortable (ish) the parser 164 | // is exhaustive at finding a license with this methodology. 165 | // We want to return nil, but we error only if Scanner.Err() did 166 | // and so normally this returns nil, nil. 
167 | return nil, errwrap.Wrapf(scannerErr, "spdx scanner error") 168 | } 169 | 170 | result := &interfaces.Result{ 171 | Licenses: licenseList, 172 | Confidence: 1.0, // TODO: what should we put here? 173 | Skip: skip, 174 | } 175 | 176 | // We perform the strange task of processing any partial results, and 177 | // returning some even if we errored, because the spdx code seems to 178 | // think this is better than no results. I'll do the same, but there is 179 | // no guarantee the calling iterator will use these. (Currently it does 180 | // not!) 181 | return result, errwrap.Wrapf(scannerErr, "spdx scanner error") 182 | } 183 | 184 | // stripTrash is an improved version of the identically named function in the 185 | // SPDX tools repository. 186 | func stripTrash(lid string) string { 187 | return stripTrashSPDX.ReplaceAllString(lid, "") 188 | } 189 | -------------------------------------------------------------------------------- /cmd/yesiscan/.gitignore: -------------------------------------------------------------------------------- 1 | # if you build the binary in this dir 2 | yesiscan 3 | .program 4 | .version 5 | dist/ 6 | .envrc 7 | -------------------------------------------------------------------------------- /cmd/yesiscan/.goreleaser.yaml: -------------------------------------------------------------------------------- 1 | # Make sure to check the documentation at https://goreleaser.com 2 | before: 3 | hooks: 4 | # You may remove this if you don't use go modules. 5 | #- go mod tidy 6 | - go generate ./... 
7 | builds: 8 | - env: 9 | - CGO_ENABLED=0 10 | goos: 11 | - linux 12 | - darwin 13 | # - windows 14 | 15 | goarch: 16 | - amd64 17 | - arm64 18 | 19 | ignore: 20 | - goarch: 386 21 | 22 | ldflags: 23 | - '-s -w -X main.program={{.ProjectName}} -X main.version={{.ShortCommit}}' 24 | 25 | archives: 26 | - format: binary 27 | # - replacements: 28 | # darwin: Darwin 29 | # linux: Linux 30 | # windows: Windows 31 | # amd64: x86_64 32 | checksum: 33 | name_template: 'checksums.txt' 34 | snapshot: 35 | name_template: "{{ incpatch .Version }}-next" 36 | changelog: 37 | sort: asc 38 | filters: 39 | exclude: 40 | - '^docs:' 41 | - '^test:' 42 | -------------------------------------------------------------------------------- /cmd/yesiscan/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all build release 2 | 3 | # These values get pulled in from these magic dot files to make builds that 4 | # already contain these values. They set these config values automatically. 5 | AUTO := $(shell cat ../../.autoConfigURI 2>/dev/null || echo '') 6 | COOKIE := $(shell cat ../../.autoConfigCookiePath 2>/dev/null || echo '') 7 | 8 | all: build 9 | 10 | build: 11 | #@go build && echo "built binary to: $(PWD)/yesiscan" 12 | @go build -ldflags="-X main.autoConfigURI=$(AUTO) -X main.autoConfigCookiePath=$(COOKIE)" && echo "built binary to: $(PWD)/yesiscan" 13 | 14 | release: 15 | goreleaser release --skip-validate --rm-dist 16 | -------------------------------------------------------------------------------- /cmd/yesiscan/web.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. 
You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | package main 25 | 26 | import ( 27 | "context" 28 | "fmt" 29 | "os" 30 | "os/signal" 31 | "strings" 32 | 33 | "github.com/awslabs/yesiscan/util/ansi" 34 | "github.com/awslabs/yesiscan/web" 35 | 36 | cli "github.com/urfave/cli/v2" // imports as package "cli" 37 | ) 38 | 39 | // Web is the general entry point for running this software as an http web 40 | // server. 41 | // TODO: replace the *cli.Context with a more general context that can be used 42 | // by all the different frontends. 43 | func Web(c *cli.Context, program, version string, debug bool) error { 44 | logf := (&ansi.Logf{ 45 | Prefix: "main: ", 46 | Ellipsis: "...", 47 | Enable: false, 48 | Prefixes: []string{}, 49 | }).Init() 50 | logf("Hello from purpleidea! This is %s, version: %s", program, version) 51 | defer logf("Done!") 52 | 53 | server := &web.Server{ 54 | Program: program, 55 | Version: version, 56 | 57 | Debug: debug, 58 | Logf: func(format string, v ...interface{}) { 59 | //logf(format, v...) // XXX: replaced for now b/c of gin logs 60 | fmt.Printf(strings.TrimRight(format, "\n")+"\n", v...) 
// avoid prefixing for now 61 | }, 62 | 63 | Profiles: c.StringSlice("profile"), 64 | Listen: c.String("listen"), 65 | } 66 | 67 | ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt) 68 | defer stop() 69 | 70 | return server.Run(ctx) 71 | } 72 | -------------------------------------------------------------------------------- /examples/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Title: Example DESCRIPTION Files 2 | Version: 1.4.1.9000 3 | License: MIT + 4 | file 5 | LICENSE | GPL-2.0 6 | License: Apache-2.0 7 | Depends: 8 | R (>= 3.4) 9 | Imports: 10 | cli, 11 | utils 12 | Suggests: 13 | callr, 14 | covr 15 | Config/testthat/edition: 3 16 | Encoding: UTF-8 17 | Language: en-US 18 | Collate: 19 | 'assertions.R' 20 | -------------------------------------------------------------------------------- /examples/big5.json: -------------------------------------------------------------------------------- 1 | { 2 | "comment": "an example profile for the big5, non-proliferation friendly licenses", 3 | "licenses": [ 4 | "AGPL-3.0-or-later", 5 | "GPL-3.0-or-later", 6 | "LGPL-3.0-or-later", 7 | "AGPL-3.0-only", 8 | "GPL-3.0-only", 9 | "LGPL-3.0-only", 10 | "AGPL-3.0", 11 | "GPL-3.0", 12 | "LGPL-3.0", 13 | "AGPL-3.0+", 14 | "GPL-3.0+", 15 | "LGPL-3.0+", 16 | 17 | "LGPL-2.1-or-later", 18 | "LGPL-2.1-only", 19 | "LGPL-2.1", 20 | "LGPL-2.1+", 21 | 22 | "GPL-2.0-or-later", 23 | "LGPL-2.0-or-later", 24 | "GPL-2.0-only", 25 | "LGPL-2.0-only", 26 | "GPL-2.0", 27 | "LGPL-2.0", 28 | "GPL-2.0+", 29 | "LGPL-2.0+", 30 | 31 | "Apache-2.0", 32 | "MIT", 33 | "MIT-0" 34 | ], 35 | "exclude": true 36 | } 37 | -------------------------------------------------------------------------------- /examples/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "quiet": false, 3 | "profiles": ["big5", "default", "fsf"], 4 | "regexp-path": "", 5 | "output-type": "html", 6 | 
"output-s3bucket": "yesiscan-test", 7 | "region": "ca-central-1", 8 | "backends": { 9 | "licenseclassifier": false, 10 | "cran": true, 11 | "pom": true, 12 | "spdx": true, 13 | "askalono": true, 14 | "scancode": true, 15 | "bitbake": true, 16 | "regexp": true 17 | }, 18 | "configs": { 19 | "~/.config/yesiscan/profiles/big5.json": "https://raw.githubusercontent.com/awslabs/yesiscan/main/examples/big5.json" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /examples/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | 5 | org.neo4j.build.plugins 6 | license-maven-plugin 7 | 4-SNAPSHOT 8 | maven-plugin 9 | 10 | 11 | 1.6 12 | 1.6 13 | 14 | 15 | 16 | 17 | The Apache Software License, Version 2.0 18 | http://www.apache.org/licenses/LICENSE-2.0.txt 19 | repo 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /examples/regexp.json: -------------------------------------------------------------------------------- 1 | { 2 | "comment": "", 3 | "rules": [ 4 | { 5 | "id": "AGPL-3.0-or-later", 6 | "pattern": "AGPL" 7 | }, 8 | { 9 | "id": "AGPL-3.0-or-later", 10 | "pattern": "AGPLv3" 11 | }, 12 | { 13 | "id": "GPL-3.0-or-later", 14 | "pattern": "GPL" 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /examples/ssh.config: -------------------------------------------------------------------------------- 1 | # run yesiscan on a remote host over ssh 2 | # put this in your ~/.ssh/config 3 | # then ssh yesiscan 4 | # then run ./yesiscan web 5 | # in your local webbrowser you can connect to http://localhost:8000 6 | Host yesiscan 7 | Hostname 8 | User 9 | 10 | LocalForward 8000 localhost:8000 11 | GSSAPIAuthentication no 12 | RequestTTY yes 13 | RemoteCommand screen -xRR 14 | 15 | -------------------------------------------------------------------------------- /go.mod: 
-------------------------------------------------------------------------------- 1 | module github.com/awslabs/yesiscan 2 | 3 | go 1.16 4 | 5 | require ( 6 | github.com/aws/aws-sdk-go-v2 v1.16.11 // indirect 7 | github.com/aws/aws-sdk-go-v2/config v1.17.1 // indirect 8 | github.com/aws/aws-sdk-go-v2/service/s3 v1.27.5 // indirect 9 | github.com/fatih/color v1.13.0 // indirect 10 | github.com/gin-contrib/multitemplate v0.0.0-20220705015713-e21a0ba39de3 // indirect 11 | github.com/gin-gonic/gin v1.8.1 // indirect 12 | github.com/go-git/go-git/v5 v5.3.0 // indirect 13 | github.com/go-playground/validator/v10 v10.10.0 // indirect 14 | github.com/google/licenseclassifier v0.0.0-20210325184830-bb04aff29e72 // indirect 15 | github.com/hashicorp/go-multierror v1.1.1 // indirect 16 | github.com/mitchellh/go-homedir v1.1.0 // indirect 17 | github.com/pkg/errors v0.9.1 // indirect 18 | github.com/shurcooL/sanitized_anchor_name v1.0.0 // indirect 19 | github.com/ssgelm/cookiejarparser v1.0.1 // indirect 20 | github.com/urfave/cli/v2 v2.14.1 // indirect 21 | golang.org/x/term v0.1.0 // indirect 22 | ) 23 | -------------------------------------------------------------------------------- /iterator/bzip2.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | package iterator 25 | 26 | import ( 27 | "compress/bzip2" 28 | "context" 29 | "crypto/sha256" 30 | "fmt" 31 | "io" 32 | "os" 33 | "strconv" 34 | "strings" 35 | "sync" 36 | "time" 37 | 38 | "github.com/awslabs/yesiscan/interfaces" 39 | "github.com/awslabs/yesiscan/util/errwrap" 40 | "github.com/awslabs/yesiscan/util/safepath" 41 | ) 42 | 43 | var ( 44 | // Bzip2Extensions is a list of valid extensions. 45 | Bzip2Extensions = []string{ 46 | ".bz", 47 | ".bz2", 48 | //".bzip" // not actually an extension that's used! 49 | ".bzip2", 50 | ".tbz", 51 | ".tbz2", 52 | //".tbzip2", // not actually an extension that's used! 53 | //".tar.bz", 54 | //".tar.bz2", 55 | //".tar.bzip2", 56 | } 57 | 58 | bzip2MapMutex *sync.Mutex 59 | bzip2Mutexes map[string]*sync.Mutex 60 | ) 61 | 62 | func init() { 63 | bzip2MapMutex = &sync.Mutex{} 64 | bzip2Mutexes = make(map[string]*sync.Mutex) 65 | } 66 | 67 | // Bzip2 is an iterator that takes a .bz or similar URI to open and performs the 68 | // decompress operation. It will eventually return an Fs iterator since there's 69 | // no need for it to know how to walk through a filesystem tree itself and it's 70 | // going to return a single file here. It can use a local cache so that future 71 | // calls to the same URI won't have to waste cycles, but only in cases when we 72 | // can determine it will be the same file. 73 | type Bzip2 struct { 74 | Debug bool 75 | Logf func(format string, v ...interface{}) 76 | Prefix safepath.AbsDir 77 | 78 | // Parser is a pointer to the parser that returned this. 
If it wasn't 79 | // returned by a parser, leave this nil. If this iterator came from an 80 | // iterator, then the Iterator handle should be filled instead. 81 | Parser interfaces.Parser 82 | 83 | // Iterator is a pointer to the iterator that returned this. If it 84 | // wasn't returned by an iterator, leave this nil. If this iterator came 85 | // from a parser, then the Parser handle should be filled instead. 86 | Iterator interfaces.Iterator 87 | 88 | // Path is the location of the file to gunzip. 89 | Path safepath.AbsFile 90 | 91 | // AllowAnyExtension specifies whether we will attempt to run if the 92 | // Path does not end with the correct bzip2 extension. 93 | AllowAnyExtension bool 94 | 95 | // AllowedExtensions specifies a list of extensions that we are allowed 96 | // to try to decode from. If this is empty, then we allow only the 97 | // defaults above because allowing no extensions at all would make no 98 | // sense. If AllowAnyExtension is set, then this has no effect. All the 99 | // matches are case insensitive. 100 | AllowedExtensions []string 101 | 102 | // iterators store the list of which iterators we created, so we know 103 | // which ones we have to close! 104 | iterators []interfaces.Iterator 105 | 106 | // unlock is a function that should be called as part of the Close 107 | // method once this resource is finished. It can be defined when 108 | // building this iterator in case we want a mechanism for the caller of 109 | // this iterator to tell the child when to unlock any in-use resources. 110 | // It must be safe to call this function more than once if necessary. 111 | // This is currently used privately. 112 | unlock func() 113 | } 114 | 115 | // String returns a human-readable representation of the bzip2 path we're 116 | // looking at. The output of this format is not guaranteed to be constant, so 117 | // don't try to parse it. 
118 | func (obj *Bzip2) String() string { 119 | return fmt.Sprintf("bzip2: %s", obj.Path) 120 | } 121 | 122 | // Validate runs some checks to ensure this iterator was built correctly. 123 | func (obj *Bzip2) Validate() error { 124 | if obj.Logf == nil { 125 | return fmt.Errorf("the Logf function must be specified") 126 | } 127 | if err := obj.Prefix.Validate(); err != nil { 128 | return err 129 | } 130 | 131 | if obj.Path.Path() == "" { 132 | return fmt.Errorf("must specify a Path") 133 | } 134 | 135 | return obj.validateExtension() 136 | } 137 | 138 | // validateExtension is a helper function to process our extension validation. 139 | func (obj *Bzip2) validateExtension() error { 140 | if obj.AllowAnyExtension { 141 | return nil 142 | } 143 | if len(obj.AllowedExtensions) == 0 { 144 | for _, x := range Bzip2Extensions { 145 | if obj.Path.HasExtInsensitive(x) { 146 | return nil 147 | } 148 | } 149 | } 150 | 151 | for _, x := range obj.AllowedExtensions { 152 | if obj.Path.HasExtInsensitive(x) { 153 | return nil 154 | } 155 | } 156 | 157 | if len(obj.AllowedExtensions) == 0 { 158 | return fmt.Errorf("a valid bzip2 extension is required without the allow any extension option") 159 | } 160 | 161 | return fmt.Errorf("an allowed extension is required to run this iterator") 162 | } 163 | 164 | // GetParser returns a handle to the parent parser that built this iterator if 165 | // there is one. 166 | func (obj *Bzip2) GetParser() interfaces.Parser { return obj.Parser } 167 | 168 | // GetIterator returns a handle to the parent iterator that built this iterator 169 | // if there is one. 170 | func (obj *Bzip2) GetIterator() interfaces.Iterator { return obj.Iterator } 171 | 172 | // Recurse runs a simple iterator that is responsible for uncompressing a bzip2 173 | // URI into a local filesystem path. If this happens successfully, it will 174 | // return a new FsIterator that is initialized to this root path. 
175 | func (obj *Bzip2) Recurse(ctx context.Context, scan interfaces.ScanFunc) ([]interfaces.Iterator, error) { 176 | relDir := safepath.UnsafeParseIntoRelDir("bzip2/") 177 | prefix := safepath.JoinToAbsDir(obj.Prefix, relDir) 178 | if err := os.MkdirAll(prefix.Path(), interfaces.Umask); err != nil { 179 | return nil, err 180 | } 181 | 182 | // make a unique ID for the directory 183 | // XXX: we can consider different algorithms or methods here later... 184 | now := strconv.FormatInt(time.Now().UnixMilli(), 10) // itoa but int64 185 | sum := sha256.Sum256([]byte(obj.Path.Path() + now)) 186 | hashRelDir, err := safepath.ParseIntoRelDir(fmt.Sprintf("%x", sum)) 187 | if err != nil { 188 | return nil, err 189 | } 190 | // ensure it gets put into a folder so it doesn't explode current dir 191 | bzip2AbsDir := safepath.JoinToAbsDir(prefix, hashRelDir) 192 | 193 | bzip2MapMutex.Lock() 194 | mu, exists := bzip2Mutexes[obj.Path.Path()] 195 | if !exists { 196 | mu = &sync.Mutex{} 197 | bzip2Mutexes[obj.Path.Path()] = mu 198 | } 199 | bzip2MapMutex.Unlock() 200 | 201 | if obj.Debug { 202 | obj.Logf("locking: %s", obj.String()) 203 | } 204 | mu.Lock() // locking happens here (unlock on all errors/returns!) 205 | once := &sync.Once{} 206 | obj.unlock = func() { 207 | fn := func() { 208 | if obj.Debug { 209 | obj.Logf("unlocking: %s", obj.String()) 210 | } 211 | mu.Unlock() 212 | } 213 | once.Do(fn) 214 | } 215 | 216 | // XXX: unlock when context closes? 217 | 218 | // XXX: If the destination dir has contents, consider removing them 219 | // first. This is one reason why we have a mutex. 220 | 221 | // Open the bzip2 file for reading. 
222 | // FIXME: use a variant that can take a context 223 | f, err := os.Open(obj.Path.Path()) 224 | if err != nil { 225 | obj.unlock() 226 | return nil, errwrap.Wrapf(err, "error opening path %s", obj.Path) 227 | } 228 | defer f.Close() 229 | 230 | z := bzip2.NewReader(f) 231 | 232 | bytesTotal := int64(0) 233 | // Iterate through the files in the archive. 234 | // TODO: add a recurring progress logf if it takes longer than 30 sec 235 | 236 | // TODO: obj.Debug ? 237 | 238 | newName := "unknown" 239 | p := obj.Path.Path() 240 | suffix := WhichSuffixInsensitive(p, Bzip2Extensions) 241 | p = strings.TrimSuffix(p, suffix) 242 | ix := strings.LastIndex(p, "/") 243 | if ix != -1 { 244 | p = p[ix+1:] 245 | if len(p) > 0 { 246 | newName = p 247 | } 248 | } 249 | 250 | obj.Logf("bzip2: %s", newName) 251 | 252 | // add in a .tar if it's an embedded tar file 253 | if p := strings.ToLower(obj.Path.Path()); strings.HasSuffix(p, ".tbz") || strings.HasSuffix(p, ".tbz2") { 254 | newName += ".tar" 255 | } 256 | relFile, err := safepath.ParseIntoRelFile(newName) 257 | if err != nil { 258 | // programming error 259 | obj.unlock() 260 | return nil, err 261 | } 262 | 263 | // this is where the output file will be stored 264 | absFile := safepath.JoinToAbsFile(bzip2AbsDir, relFile) 265 | 266 | // XXX: sanity check (is output in the dir?) 
267 | // TODO: we could add this, but safepath automatically does this 268 | // if absFile is not inside of bzip2AbsDir then error 269 | 270 | absDir := absFile.Dir() // get the absDir that absFile is in 271 | 272 | if err := os.MkdirAll(absDir.Path(), os.ModePerm); err != nil { 273 | // programming error 274 | obj.unlock() 275 | return nil, err 276 | } 277 | 278 | // write to this location 279 | dest, err := os.OpenFile(absFile.Path(), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, os.ModePerm) 280 | if err != nil { 281 | obj.unlock() 282 | return nil, errwrap.Wrapf(err, "error writing our file to disk at %s", absFile) 283 | } 284 | // don't `defer` close here because we want to free in the loop 285 | 286 | // FIXME: use a variant that can take a context 287 | size, err := io.Copy(dest, z) 288 | if e, ok := err.(bzip2.StructuralError); ok { 289 | dest.Close() // close dest file on error! 290 | obj.unlock() 291 | return nil, &interfaces.IteratorError{ 292 | Path: obj.Path.Path(), 293 | Err: errwrap.Wrapf(e, "error decompressing bzip2"), 294 | } 295 | 296 | } else if err != nil { 297 | dest.Close() // close dest file on error! 298 | obj.unlock() 299 | return nil, errwrap.Wrapf(err, "error writing our file to disk at %s", absFile) 300 | } 301 | obj.Logf("uncompressed: %d bytes to disk at %s", size, absFile) 302 | 303 | dest.Close() // close dest file on error! 304 | 305 | bytesTotal += int64(size) 306 | 307 | // TODO: change to human readable bytes 308 | obj.Logf("uncompressed from %s into %s (%d bytes)", obj.String(), bzip2AbsDir, bytesTotal) 309 | 310 | obj.iterators = []interfaces.Iterator{} 311 | 312 | // if it's a single bzip2 file we return an fs iterator and let the fs 313 | // iterator sort that out... 314 | iterator := &Fs{ 315 | Debug: obj.Debug, 316 | Logf: func(format string, v ...interface{}) { 317 | obj.Logf(format, v...) // TODO: add a prefix? 
318 | }, 319 | Prefix: obj.Prefix, 320 | 321 | Iterator: obj, 322 | 323 | Path: bzip2AbsDir, 324 | 325 | //Unlock: unlock, 326 | } 327 | obj.iterators = append(obj.iterators, iterator) 328 | 329 | return obj.iterators, nil 330 | } 331 | 332 | // Close shuts down the iterator and/or performs clean up after the Recurse 333 | // method has run. This must be called if you run Recurse. 334 | func (obj *Bzip2) Close() error { 335 | if obj.unlock != nil { 336 | obj.unlock() 337 | } 338 | var errs error 339 | for i := len(obj.iterators) - 1; i >= 0; i-- { // reverse order (stacks!) 340 | if err := obj.iterators[i].Close(); err != nil { 341 | errs = errwrap.Append(errs, err) 342 | } 343 | } 344 | return errs 345 | } 346 | -------------------------------------------------------------------------------- /iterator/gzip.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 
21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | package iterator 25 | 26 | import ( 27 | "compress/gzip" 28 | "context" 29 | "crypto/sha256" 30 | "fmt" 31 | "io" 32 | "os" 33 | "strconv" 34 | "strings" 35 | "sync" 36 | "time" 37 | 38 | "github.com/awslabs/yesiscan/interfaces" 39 | "github.com/awslabs/yesiscan/util/errwrap" 40 | "github.com/awslabs/yesiscan/util/safepath" 41 | ) 42 | 43 | var ( 44 | // GzipExtensions is a list of valid extensions. 45 | GzipExtensions = []string{ 46 | ".gz", 47 | ".gzip", 48 | ".tgz", 49 | //".tar.gz", 50 | //".tar.gzip", 51 | } 52 | 53 | gzipMapMutex *sync.Mutex 54 | gzipMutexes map[string]*sync.Mutex 55 | ) 56 | 57 | func init() { 58 | gzipMapMutex = &sync.Mutex{} 59 | gzipMutexes = make(map[string]*sync.Mutex) 60 | } 61 | 62 | // Gzip is an iterator that takes a .gz or similar URI to open and performs the 63 | // decompress operation. It will eventually return an Fs iterator since there's 64 | // no need for it to know how to walk through a filesystem tree itself and it's 65 | // going to return a single file here. It can use a local cache so that future 66 | // calls to the same URI won't have to waste cycles, but only in cases when we 67 | // can determine it will be the same file. This does _not_ support gzip 68 | // multistream, but it could be added if we find a use-case for it. 69 | type Gzip struct { 70 | Debug bool 71 | Logf func(format string, v ...interface{}) 72 | Prefix safepath.AbsDir 73 | 74 | // Parser is a pointer to the parser that returned this. If it wasn't 75 | // returned by a parser, leave this nil. If this iterator came from an 76 | // iterator, then the Iterator handle should be filled instead. 77 | Parser interfaces.Parser 78 | 79 | // Iterator is a pointer to the iterator that returned this. If it 80 | // wasn't returned by an iterator, leave this nil. If this iterator came 81 | // from a parser, then the Parser handle should be filled instead. 
82 | Iterator interfaces.Iterator 83 | 84 | // Path is the location of the file to gunzip. 85 | Path safepath.AbsFile 86 | 87 | // AllowAnyExtension specifies whether we will attempt to run if the 88 | // Path does not end with the correct gzip extension. 89 | AllowAnyExtension bool 90 | 91 | // AllowedExtensions specifies a list of extensions that we are allowed 92 | // to try to decode from. If this is empty, then we allow only the 93 | // defaults above because allowing no extensions at all would make no 94 | // sense. If AllowAnyExtension is set, then this has no effect. All the 95 | // matches are case insensitive. 96 | AllowedExtensions []string 97 | 98 | // iterators store the list of which iterators we created, so we know 99 | // which ones we have to close! 100 | iterators []interfaces.Iterator 101 | 102 | // unlock is a function that should be called as part of the Close 103 | // method once this resource is finished. It can be defined when 104 | // building this iterator in case we want a mechanism for the caller of 105 | // this iterator to tell the child when to unlock any in-use resources. 106 | // It must be safe to call this function more than once if necessary. 107 | // This is currently used privately. 108 | unlock func() 109 | } 110 | 111 | // String returns a human-readable representation of the gzip path we're looking 112 | // at. The output of this format is not guaranteed to be constant, so don't try 113 | // to parse it. 114 | func (obj *Gzip) String() string { 115 | return fmt.Sprintf("gzip: %s", obj.Path) 116 | } 117 | 118 | // Validate runs some checks to ensure this iterator was built correctly. 
119 | func (obj *Gzip) Validate() error { 120 | if obj.Logf == nil { 121 | return fmt.Errorf("the Logf function must be specified") 122 | } 123 | if err := obj.Prefix.Validate(); err != nil { 124 | return err 125 | } 126 | 127 | if obj.Path.Path() == "" { 128 | return fmt.Errorf("must specify a Path") 129 | } 130 | 131 | return obj.validateExtension() 132 | } 133 | 134 | // validateExtension is a helper function to process our extension validation. 135 | func (obj *Gzip) validateExtension() error { 136 | if obj.AllowAnyExtension { 137 | return nil 138 | } 139 | if len(obj.AllowedExtensions) == 0 { 140 | for _, x := range GzipExtensions { 141 | if obj.Path.HasExtInsensitive(x) { 142 | return nil 143 | } 144 | } 145 | } 146 | 147 | for _, x := range obj.AllowedExtensions { 148 | if obj.Path.HasExtInsensitive(x) { 149 | return nil 150 | } 151 | } 152 | 153 | if len(obj.AllowedExtensions) == 0 { 154 | return fmt.Errorf("a valid gzip extension is required without the allow any extension option") 155 | } 156 | 157 | return fmt.Errorf("an allowed extension is required to run this iterator") 158 | } 159 | 160 | // GetParser returns a handle to the parent parser that built this iterator if 161 | // there is one. 162 | func (obj *Gzip) GetParser() interfaces.Parser { return obj.Parser } 163 | 164 | // GetIterator returns a handle to the parent iterator that built this iterator 165 | // if there is one. 166 | func (obj *Gzip) GetIterator() interfaces.Iterator { return obj.Iterator } 167 | 168 | // Recurse runs a simple iterator that is responsible for uncompressing a gzip 169 | // URI into a local filesystem path. If this happens successfully, it will 170 | // return a new FsIterator that is initialized to this root path. 
171 | func (obj *Gzip) Recurse(ctx context.Context, scan interfaces.ScanFunc) ([]interfaces.Iterator, error) { 172 | relDir := safepath.UnsafeParseIntoRelDir("gzip/") 173 | prefix := safepath.JoinToAbsDir(obj.Prefix, relDir) 174 | if err := os.MkdirAll(prefix.Path(), interfaces.Umask); err != nil { 175 | return nil, err 176 | } 177 | 178 | // make a unique ID for the directory 179 | // XXX: we can consider different algorithms or methods here later... 180 | now := strconv.FormatInt(time.Now().UnixMilli(), 10) // itoa but int64 181 | sum := sha256.Sum256([]byte(obj.Path.Path() + now)) 182 | hashRelDir, err := safepath.ParseIntoRelDir(fmt.Sprintf("%x", sum)) 183 | if err != nil { 184 | return nil, err 185 | } 186 | // ensure it gets put into a folder so it doesn't explode current dir 187 | gzipAbsDir := safepath.JoinToAbsDir(prefix, hashRelDir) 188 | 189 | gzipMapMutex.Lock() 190 | mu, exists := gzipMutexes[obj.Path.Path()] 191 | if !exists { 192 | mu = &sync.Mutex{} 193 | gzipMutexes[obj.Path.Path()] = mu 194 | } 195 | gzipMapMutex.Unlock() 196 | 197 | if obj.Debug { 198 | obj.Logf("locking: %s", obj.String()) 199 | } 200 | mu.Lock() // locking happens here (unlock on all errors/returns!) 201 | once := &sync.Once{} 202 | obj.unlock = func() { 203 | fn := func() { 204 | if obj.Debug { 205 | obj.Logf("unlocking: %s", obj.String()) 206 | } 207 | mu.Unlock() 208 | } 209 | once.Do(fn) 210 | } 211 | 212 | // XXX: unlock when context closes? 213 | 214 | // XXX: If the destination dir has contents, consider removing them 215 | // first. This is one reason why we have a mutex. 216 | 217 | // Open the gzip file for reading. 
	// FIXME: use a variant that can take a context
	f, err := os.Open(obj.Path.Path())
	if err != nil {
		obj.unlock()
		return nil, errwrap.Wrapf(err, "error opening path %s", obj.Path)
	}
	defer f.Close()

	z, err := gzip.NewReader(f)
	if err != nil {
		obj.unlock()
		return nil, errwrap.Wrapf(err, "error reading gzip %s", obj.Path)
	}
	defer z.Close()
	z.Multistream(false) // TODO: do we ever want to allow this here?

	filesTotal := 0
	bytesTotal := int64(0)
	// Iterate through the files in the archive.
	// TODO: add a recurring progress logf if it takes longer than 30 sec
	for {
		// In an effort to short-circuit things if needed, we run a
		// check ourselves and break out early if we see that we have
		// cancelled early.
		select {
		case <-ctx.Done():
			obj.unlock()
			return nil, errwrap.Wrapf(ctx.Err(), "ended decompressing early")
		default:
		}

		if s := z.Header.Comment; s != "" {
			obj.Logf("gzip has comment: %s", s)
		}

		// TODO: obj.Debug ?
		name := z.Header.Name
		newName := name
		if name != "" {
			obj.Logf("gzip: %s", name)
		} else {
			// a .tgz might have no name string for example
			obj.Logf("gzip name is empty")
			newName = "unknown"
			// Fall back to the basename of the archive itself with
			// the recognized gzip suffix stripped off.
			p := obj.Path.Path()
			suffix := WhichSuffixInsensitive(p, GzipExtensions)
			p = strings.TrimSuffix(p, suffix)
			ix := strings.LastIndex(p, "/")
			if ix != -1 {
				p = p[ix+1:]
				if len(p) > 0 {
					newName = p
				}
				obj.Logf("gzip basename: %s", newName)
			}
		}

		// add in a .tar if it's an embedded tar file
		if strings.HasSuffix(strings.ToLower(obj.Path.Path()), ".tgz") {
			newName += ".tar"
		}
		relFile, err := safepath.ParseIntoRelFile(newName)
		if err != nil {
			// programming error
			obj.unlock()
			return nil, err
		}

		// this is where the output file will be stored
		absFile := safepath.JoinToAbsFile(gzipAbsDir, relFile)

		// XXX: sanity check (is output in the dir?)
		// TODO: we could add this, but safepath automatically does this
		// if absFile is not inside of gzipAbsDir then error

		absDir := absFile.Dir() // get the absDir that absFile is in

		if err := os.MkdirAll(absDir.Path(), os.ModePerm); err != nil {
			// programming error
			obj.unlock()
			return nil, err
		}

		// write to this location
		dest, err := os.OpenFile(absFile.Path(), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, os.ModePerm)
		if err != nil {
			obj.unlock()
			return nil, errwrap.Wrapf(err, "error writing our file to disk at %s", absFile)
		}
		// don't `defer` close here because we want to free in the loop

		// FIXME: use a variant that can take a context
		size, err := io.Copy(dest, z)
		if err != nil {
			dest.Close() // close dest file on error!
			obj.unlock()
			return nil, errwrap.Wrapf(err, "error writing our file to disk at %s", absFile)
		}
		obj.Logf("uncompressed: %d bytes to disk at %s", size, absFile)

		dest.Close() // success path close (we avoided defer inside the loop)

		filesTotal++
		bytesTotal += int64(size)

		break // TODO: remove if we ever do multistream
	}

	// TODO: change to human readable bytes
	obj.Logf("uncompressed: %d files from %s into %s (%d bytes)", filesTotal, obj.String(), gzipAbsDir, bytesTotal)

	obj.iterators = []interfaces.Iterator{}

	// if it's a single gzip file we return an fs iterator and let the fs
	// iterator sort that out...
	iterator := &Fs{
		Debug: obj.Debug,
		Logf: func(format string, v ...interface{}) {
			obj.Logf(format, v...) // TODO: add a prefix?
		},
		Prefix: obj.Prefix,

		Iterator: obj,

		Path: gzipAbsDir,

		//Unlock: unlock,
	}
	obj.iterators = append(obj.iterators, iterator)

	return obj.iterators, nil
}

// Close shuts down the iterator and/or performs clean up after the Recurse
// method has run. This must be called if you run Recurse.
func (obj *Gzip) Close() error {
	if obj.unlock != nil {
		obj.unlock() // safe to call more than once (guarded by sync.Once)
	}
	var errs error
	for i := len(obj.iterators) - 1; i >= 0; i-- { // reverse order (stacks!)
		if err := obj.iterators[i].Close(); err != nil {
			errs = errwrap.Append(errs, err)
		}
	}
	return errs
}
--------------------------------------------------------------------------------
/iterator/http.go:
--------------------------------------------------------------------------------
// Copyright Amazon.com Inc or its affiliates and the project contributors
// Written by James Shubin and the project contributors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.
//
// We will never require a CLA to submit a patch. All contributions follow the
// `inbound == outbound` rule.
//
// This is not an official Amazon product. Amazon does not offer support for
// this project.
//
// SPDX-License-Identifier: Apache-2.0

package iterator

import (
	"context"
	"crypto/sha256"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/awslabs/yesiscan/interfaces"
	"github.com/awslabs/yesiscan/util/errwrap"
	"github.com/awslabs/yesiscan/util/safepath"
)

const (
	// HttpScheme is the standard prefix used for http URL's.
	HttpScheme = "http://"

	// HttpsScheme is the standard prefix used for https URL's.
	HttpsScheme = "https://"

	// HttpSchemeRaw is the standard prefix used for http URL's but without
	// the scheme protocol separator which is "://".
	HttpSchemeRaw = "http"

	// HttpsSchemeRaw is the standard prefix used for https URL's but
	// without the scheme protocol separator which is "://".
	HttpsSchemeRaw = "https"

	// UnknownFileName is the filename used when the URL doesn't have an
	// obvious filename at the end that we can use.
	// TODO: is there a better name we can use? This is mostly arbitrary.
	UnknownFileName = ".unknown"
)

var (
	// httpMapMutex guards access to the httpMutexes map below.
	httpMapMutex *sync.Mutex

	// httpMutexes holds one mutex per URL, so that concurrent Recurse
	// calls downloading the same URL serialize instead of racing on disk.
	httpMutexes map[string]*sync.Mutex
)

func init() {
	httpMapMutex = &sync.Mutex{}
	httpMutexes = make(map[string]*sync.Mutex)
}

// Http is an iterator that takes an http URL to download and performs the
// download operation. It will eventually return an Fs iterator since there's no
// need for it to know how to walk through a filesystem tree itself. It can use
// a local cache so that future calls to the same URL won't have to waste
// bandwidth or cycles again but only in cases when we can determine it will be
// the same file. Please note this is named http, but we obviously support https
// as the most common form of this.
type Http struct {
	// Debug enables more verbose logging.
	Debug bool

	// Logf is the logging function used by this iterator. It must be set.
	Logf func(format string, v ...interface{})

	// Prefix is the working directory under which downloads are stored.
	Prefix safepath.AbsDir

	// Parser is a pointer to the parser that returned this. If it wasn't
	// returned by a parser, leave this nil. If this iterator came from an
	// iterator, then the Iterator handle should be filled instead.
	Parser interfaces.Parser

	// Iterator is a pointer to the iterator that returned this. If it
	// wasn't returned by an iterator, leave this nil. If this iterator came
	// from a parser, then the Parser handle should be filled instead.
	Iterator interfaces.Iterator

	// URL is the http URL of the file that we want to download.
	// TODO: consider doing some clever parsing of well-known paths like
	// github-style URL's or internal company code repository URL's.
	URL string

	// AllowHttp specifies whether we're allowed to download http
	// (unencrypted) URLs.
	AllowHttp bool

	// iterators store the list of which iterators we created, so we know
	// which ones we have to close!
	iterators []interfaces.Iterator

	// unlock is a function that should be called as part of the Close
	// method once this resource is finished. It can be defined when
	// building this iterator in case we want a mechanism for the caller of
	// this iterator to tell the child when to unlock any in-use resources.
	// It must be safe to call this function more than once if necessary.
	// This is currently used privately.
	unlock func()
}

// String returns a human-readable representation of the http URL we're looking
// at. The output of this format is not guaranteed to be constant, so don't try
// to parse it.
func (obj *Http) String() string {
	return fmt.Sprintf("http: %s", obj.URL)
}

// Validate runs some checks to ensure this iterator was built correctly.
127 | func (obj *Http) Validate() error { 128 | if obj.Logf == nil { 129 | return fmt.Errorf("the Logf function must be specified") 130 | } 131 | if err := obj.Prefix.Validate(); err != nil { 132 | return err 133 | } 134 | 135 | if obj.URL == "" { 136 | return fmt.Errorf("must specify a URL") 137 | } 138 | 139 | if _, err := url.Parse(obj.URL); err != nil { 140 | return err // not that url.Parse ever really errors :/ 141 | } 142 | 143 | isHttp := strings.HasPrefix(strings.ToLower(obj.URL), HttpScheme) 144 | isHttps := strings.HasPrefix(strings.ToLower(obj.URL), HttpsScheme) 145 | if !isHttp && !isHttps { 146 | return fmt.Errorf("invalid scheme") 147 | } 148 | 149 | if isHttp && !obj.AllowHttp { 150 | // did you mean https ? 151 | return fmt.Errorf("the http scheme is not allowed without the allow http option") 152 | } 153 | 154 | return nil 155 | } 156 | 157 | // GetParser returns a handle to the parent parser that built this iterator if 158 | // there is one. 159 | func (obj *Http) GetParser() interfaces.Parser { return obj.Parser } 160 | 161 | // GetIterator returns a handle to the parent iterator that built this iterator 162 | // if there is one. 163 | func (obj *Http) GetIterator() interfaces.Iterator { return obj.Iterator } 164 | 165 | // Recurse runs a simple iterator that is responsible for downloading an http 166 | // url into a local filesystem path. If this happens successfully, it 167 | // will return a new FsIterator that is initialized to this root path. 168 | func (obj *Http) Recurse(ctx context.Context, scan interfaces.ScanFunc) ([]interfaces.Iterator, error) { 169 | relDir := safepath.UnsafeParseIntoRelDir("http/") 170 | prefix := safepath.JoinToAbsDir(obj.Prefix, relDir) 171 | if err := os.MkdirAll(prefix.Path(), interfaces.Umask); err != nil { 172 | return nil, err 173 | } 174 | 175 | // make a unique ID for the directory 176 | // XXX: we can consider different algorithms or methods here later... 
177 | now := strconv.FormatInt(time.Now().UnixMilli(), 10) // itoa but int64 178 | sum := sha256.Sum256([]byte(obj.URL + now)) 179 | hashRelDir, err := safepath.ParseIntoRelDir(fmt.Sprintf("%x", sum)) 180 | if err != nil { 181 | return nil, err 182 | } 183 | httpAbsDir := safepath.JoinToAbsDir(prefix, hashRelDir) 184 | 185 | httpMapMutex.Lock() 186 | mu, exists := httpMutexes[obj.URL] 187 | if !exists { 188 | mu = &sync.Mutex{} 189 | httpMutexes[obj.URL] = mu 190 | } 191 | httpMapMutex.Unlock() 192 | 193 | if obj.Debug { 194 | obj.Logf("locking: %s", obj.String()) 195 | } 196 | mu.Lock() // locking happens here (unlock on all errors/returns!) 197 | once := &sync.Once{} 198 | obj.unlock = func() { 199 | fn := func() { 200 | if obj.Debug { 201 | obj.Logf("unlocking: %s", obj.String()) 202 | } 203 | mu.Unlock() 204 | } 205 | once.Do(fn) 206 | } 207 | 208 | // XXX: unlock when context closes? 209 | 210 | u, err := url.Parse(obj.URL) 211 | if err != nil { 212 | // programming error 213 | obj.unlock() 214 | return nil, errwrap.Wrapf(err, "error parsing URL %s", obj.URL) 215 | } 216 | segments := strings.Split(u.Path, "/") 217 | fileName := UnknownFileName // default 218 | if len(segments) > 0 { 219 | fileName = segments[len(segments)-1] 220 | } 221 | 222 | relFile, err := safepath.ParseIntoRelFile(fileName) 223 | if err != nil { 224 | // programming error 225 | obj.unlock() 226 | return nil, err 227 | } 228 | 229 | //directory := httpAbsDir.Path() 230 | fullFileNameAbsFile := safepath.JoinToAbsFile(httpAbsDir, relFile) 231 | fullFileName := fullFileNameAbsFile.Path() 232 | 233 | // make the dir we put the downloaded file into 234 | if err := os.MkdirAll(httpAbsDir.Path(), interfaces.Umask); err != nil { 235 | obj.unlock() 236 | return nil, err 237 | } 238 | 239 | // This is one reason why we have a mutex. 
240 | if _, err := os.Stat(fullFileName); err == nil { 241 | obj.Logf("file %s already exists, overwriting", obj.String()) 242 | } 243 | 244 | // create blank file 245 | file, err := os.Create(fullFileName) 246 | if err != nil { 247 | obj.unlock() 248 | return nil, errwrap.Wrapf(err, "error writing file %s", fullFileNameAbsFile) 249 | } 250 | defer file.Close() 251 | 252 | obj.Logf("downloading %s into %s as %s", obj.URL, httpAbsDir, fileName) 253 | 254 | req, err := http.NewRequestWithContext(ctx, "GET", obj.URL, nil) // XXX: nil? 255 | if err != nil { 256 | obj.unlock() 257 | return nil, errwrap.Wrapf(err, "error building request for %s", obj.URL) 258 | } 259 | 260 | //tr := &http.Transport{ 261 | // IdleConnTimeout: 30 * time.Second, 262 | //} 263 | client := &http.Client{ 264 | //Transport: tr, 265 | 266 | // If CheckRedirect is nil, the Client uses its default policy, 267 | // which is to stop after 10 consecutive requests. 268 | // CheckRedirect func(req *Request, via []*Request) error 269 | CheckRedirect: nil, 270 | } 271 | 272 | // TODO: add a recurring progress logf if it takes longer than 30 sec 273 | resp, err := client.Do(req) 274 | if err != nil { 275 | obj.unlock() 276 | return nil, errwrap.Wrapf(err, "error do-ing request for %s", obj.URL) 277 | } 278 | defer resp.Body.Close() 279 | 280 | // TODO: should we allow others? 
281 | if resp.StatusCode != 200 { 282 | obj.unlock() 283 | return nil, fmt.Errorf("bad status code of: %d", resp.StatusCode) 284 | } 285 | 286 | // FIXME: add a variant that can take a context 287 | size, err := io.Copy(file, resp.Body) 288 | if err != nil { 289 | obj.unlock() 290 | return nil, errwrap.Wrapf(err, "error writing our file to disk at %s", fullFileNameAbsFile) 291 | } 292 | obj.Logf("copied: %d bytes to disk at %s", size, fullFileNameAbsFile) 293 | 294 | obj.iterators = []interfaces.Iterator{} 295 | 296 | if strings.HasPrefix(obj.URL, HttpScheme) { 297 | u.Scheme = HttpSchemeRaw 298 | } 299 | if strings.HasPrefix(obj.URL, HttpsScheme) { 300 | u.Scheme = HttpsSchemeRaw 301 | } 302 | u.Opaque = "" // encoded opaque data 303 | if _, has := u.User.Password(); has { // redact password 304 | u.User = url.UserPassword(u.User.Username(), "") 305 | } 306 | //u.Host = ? // host or host:port 307 | 308 | u.RawPath = "" // encoded path hint (see EscapedPath method) 309 | u.ForceQuery = false // append a query ('?') even if RawQuery is empty 310 | v := url.Values{} 311 | v.Set("now", now) 312 | u.RawQuery = v.Encode() // encoded query values, without '?' 313 | u.Fragment = "" // fragment for references, without '#' 314 | u.RawFragment = "" // encoded fragment hint (see EscapedFragment method) 315 | 316 | // XXX: if it's a single zip file do we return a zip iterator here or do 317 | // we let the fs iterator sort that out... 318 | iterator := &Fs{ 319 | Debug: obj.Debug, 320 | Logf: func(format string, v ...interface{}) { 321 | obj.Logf(format, v...) // TODO: add a prefix? 322 | }, 323 | Prefix: obj.Prefix, 324 | 325 | Iterator: obj, 326 | 327 | // XXX: what path? 
328 | Path: httpAbsDir, 329 | 330 | GenUID: func(safePath safepath.Path) (string, error) { 331 | if !safepath.HasPrefix(safePath, httpAbsDir) { 332 | // programming error 333 | return "", fmt.Errorf("path doesn't have prefix") 334 | } 335 | 336 | p := "" 337 | // remove httpAbsDir prefix from safePath to get a relPath 338 | relPath, err := safepath.StripPrefix(safePath, httpAbsDir) 339 | if err == nil { 340 | p = relPath.String() 341 | } else if err != nil && safePath.String() != httpAbsDir.String() { 342 | // programming error 343 | return "", errwrap.Wrapf(err, "problem stripping prefix") 344 | } 345 | 346 | x := *u // copy 347 | x.Path += "/" + p 348 | 349 | return x.String(), nil 350 | }, 351 | 352 | //Unlock: unlock, 353 | } 354 | obj.iterators = append(obj.iterators, iterator) 355 | 356 | return obj.iterators, nil 357 | } 358 | 359 | // Close shuts down the iterator and/or performs clean up after the Recurse 360 | // method has run. This must be called if you run Recurse. 361 | func (obj *Http) Close() error { 362 | if obj.unlock != nil { 363 | obj.unlock() 364 | } 365 | var errs error 366 | for i := len(obj.iterators) - 1; i >= 0; i-- { // reverse order (stacks!) 367 | if err := obj.iterators[i].Close(); err != nil { 368 | errs = errwrap.Append(errs, err) 369 | } 370 | } 371 | return errs 372 | } 373 | -------------------------------------------------------------------------------- /iterator/iterator.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. 
// You may obtain a copy of
// the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.
//
// We will never require a CLA to submit a patch. All contributions follow the
// `inbound == outbound` rule.
//
// This is not an official Amazon product. Amazon does not offer support for
// this project.
//
// SPDX-License-Identifier: Apache-2.0

package iterator

import (
	"fmt"
	"io/fs"

	"github.com/awslabs/yesiscan/interfaces"
	"github.com/awslabs/yesiscan/util/safepath"
)

var (
	// SkipPathExtensions is a list of file extensions to not scan. This
	// list is alphabetical and has a comment for each element.
	SkipPathExtensions = []string{
		".bmp",       // image format
		".csv",       // data format
		".cvsignore", // csv ignore file
		".doc",       // document format
		".eps",       // image format
		".gif",       // image format
		".gitignore", // git ignore file
		".jpeg",      // image format with weird naming
		".jpg",       // image format
		".ico",       // icon file format
		".pdf",       // document format
		".png",       // image format
		".ppt",       // presentation format (microsoft)
		".svg",       // image format
		".odp",       // presentation format (libreoffice)
		".ods",       // spreadsheet format (libreoffice)
		".odt",       // document format (libreoffice)
		".xls",       // spreadsheet format
	}

	// SkipDirPaths is a list of relative dir paths to not scan. This list
	// is alphabetical and has a comment for each element.
	SkipDirPaths = []string{
		".git/",    // internal git folder
		".github/", // github specific stuff
		".svn/",    // internal svn folder
		//".eggs/", // python ??? directory
	}
)

// SkipPath takes an input path and file info struct, and returns whether we
// should skip over it or not. To skip it, return true and no error. To skip a
// directory, return interfaces.SkipDir as the error. Lastly, if anything goes
// wrong, you can return your own error, but minimizing this chance is ideal.
// The stuff that gets skipped in here *must* be common for all iterators, as
// this function is shared by all of them. Individual backends can have their
// own file skip detection as well. For example, one particular backend might
// not know how to scan *.go files, where as a different one might specialize in
// this. Lastly, a design decision was made to make this a "pure, stateless"
// function. In other words, the decision to skip a file or not should be based
// entirely on the input arguments, and more complicated skip functions that
// might take into account more complex logic, such as the existence of multiple
// file paths is not possible. For example, if someone were to invent a file
// called `.legalignore` that worked like `.gitignore` but told software which
// files copyrights wouldn't apply from, we'd be unable to detect those and skip
// over them with this skip function since it only has a view into individual
// files and doesn't get a stateful, full directory tree view.
func SkipPath(path safepath.Path, info fs.FileInfo) (bool, error) {

	// TODO: This could be built with a list of rules that we pass into the
	// iterator, so that it could be configurable as needed.

	if !path.IsAbs() { // the walk func gives us absolutes
		return false, fmt.Errorf("path %s was not absolute", path.String())
	}

	if info.IsDir() { // path.IsDir()
		absDir, ok := path.(safepath.AbsDir)
		if !ok { // should not happen unless bug
			return false, fmt.Errorf("expected AbsDir")
		}

		for _, dir := range SkipDirPaths {
			relDir := safepath.UnsafeParseIntoRelDir(dir)
			if absDir.HasDir(relDir) {
				return true, interfaces.SkipDir
			}
		}

		return false, nil // don't skip
	}

	absFile, ok := path.(safepath.AbsFile)
	if !ok { // should not happen unless bug
		return false, fmt.Errorf("expected AbsFile")
	}

	for _, ext := range SkipPathExtensions {
		// Make sure we have at least one char in the file name (x.foo)
		// and insensitive match on extensions like .foo that we skip.
		// NOTE(review): this guard compares len(ext) against the length
		// of the full absolute path, not the basename, so for absolute
		// paths it is effectively always true and a file literally
		// named ".gitignore" is still skipped. That seems intended for
		// the ignore-file entries in the list above, but confirm intent
		// before changing this to a basename comparison.
		if absFile.HasExtInsensitive(ext) && len(ext) != len(absFile.Path()) { // case insensitive
			return true, nil
		}
	}

	return false, nil // don't skip
}
--------------------------------------------------------------------------------
/iterator/util.go:
--------------------------------------------------------------------------------
// Copyright Amazon.com Inc or its affiliates and the project contributors
// Written by James Shubin and the project contributors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the
// License for the specific language governing permissions and limitations under
// the License.
//
// We will never require a CLA to submit a patch. All contributions follow the
// `inbound == outbound` rule.
//
// This is not an official Amazon product. Amazon does not offer support for
// this project.
//
// SPDX-License-Identifier: Apache-2.0

package iterator

import (
	"strings"
)

// WhichSuffixInsensitive returns the first suffix with the longest match that
// is found in the input string from the list provided. If none are found, then
// the empty string is returned. The comparisons are done in lower case, but
// the returned suffix is in the original case from the input list.
func WhichSuffixInsensitive(s string, suffixList []string) string {
	suffix := ""
	length := 0
	for _, x := range suffixList {
		if strings.HasSuffix(strings.ToLower(s), strings.ToLower(x)) {
			// a longer match always wins over an earlier shorter one
			if l := len(x); l > length {
				suffix = x
				length = l
			}
		}
	}
	return suffix
}
--------------------------------------------------------------------------------
/iterator/util_test.go:
--------------------------------------------------------------------------------
// Copyright Amazon.com Inc or its affiliates and the project contributors
// Written by James Shubin and the project contributors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the
// License for the specific language governing permissions and limitations under
// the License.
//
// We will never require a CLA to submit a patch. All contributions follow the
// `inbound == outbound` rule.
//
// This is not an official Amazon product. Amazon does not offer support for
// this project.
//
// SPDX-License-Identifier: Apache-2.0

package iterator_test

import (
	"testing"

	"github.com/awslabs/yesiscan/iterator"
)

// TestWhichSuffixInsensitive checks that matching is case insensitive (".FoO"
// matches ".foO") and that the matched suffix is returned in the original case
// from the list, not from the input string.
func TestWhichSuffixInsensitive(t *testing.T) {
	has := ".FoO"
	exp := ".foO"
	suffixList := []string{
		".fooo", // longer, but not a suffix of `has`
		exp,
		".bar",
		".BAZ",
	}
	if s := iterator.WhichSuffixInsensitive(has, suffixList); s != exp {
		t.Errorf("exp: %s", exp)
		t.Errorf("got: %s", s)
		return
	}
}
--------------------------------------------------------------------------------
/iterator/zip.go:
--------------------------------------------------------------------------------
// Copyright Amazon.com Inc or its affiliates and the project contributors
// Written by James Shubin and the project contributors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.
//
// We will never require a CLA to submit a patch. All contributions follow the
// `inbound == outbound` rule.
//
// This is not an official Amazon product.
// Amazon does not offer support for
// this project.
//
// SPDX-License-Identifier: Apache-2.0

package iterator

import (
	"archive/zip"
	"context"
	"crypto/sha256"
	"fmt"
	"io"
	"os"
	"strconv"
	"sync"
	"time"

	"github.com/awslabs/yesiscan/interfaces"
	"github.com/awslabs/yesiscan/util/errwrap"
	"github.com/awslabs/yesiscan/util/safepath"
)

const (
	// ZipExtension is the standard extension used for zip URI's.
	ZipExtension = ".zip"

	// JarExtension is used for java .jar files. This is included here since
	// they are just zip files that are named differently.
	JarExtension = ".jar"

	// WhlExtension is used for python .whl files. This is included here since
	// they are just zip files that are named differently.
	WhlExtension = ".whl"
)

var (
	// zipMapMutex guards access to the zipMutexes map below.
	zipMapMutex *sync.Mutex

	// zipMutexes holds one mutex per archive path, so that concurrent
	// Recurse calls unzipping the same file serialize instead of racing
	// on the shared output directory.
	zipMutexes map[string]*sync.Mutex
)

func init() {
	zipMapMutex = &sync.Mutex{}
	zipMutexes = make(map[string]*sync.Mutex)
}

// Zip is an iterator that takes a .zip URI to open and performs the unzip
// operation. It will eventually return an Fs iterator since there's no need for
// it to know how to walk through a filesystem tree itself. It can use a local
// cache so that future calls to the same URI won't have to waste cycles, but
// only in cases when we can determine it will be the same file.
type Zip struct {
	// Debug enables more verbose logging.
	Debug bool

	// Logf is the logging function used by this iterator. It must be set.
	Logf func(format string, v ...interface{})

	// Prefix is the working directory under which extraction happens.
	Prefix safepath.AbsDir

	// Parser is a pointer to the parser that returned this. If it wasn't
	// returned by a parser, leave this nil. If this iterator came from an
	// iterator, then the Iterator handle should be filled instead.
	Parser interfaces.Parser

	// Iterator is a pointer to the iterator that returned this. If it
	// wasn't returned by an iterator, leave this nil. If this iterator came
	// from a parser, then the Parser handle should be filled instead.
	Iterator interfaces.Iterator

	// Path is the location of the file to unzip.
	Path safepath.AbsFile

	// FIXME: add zip max file limit field to prevent zip bombs

	// TODO: add zip password field

	// AllowAnyExtension specifies whether we will attempt to run if the
	// Path does not end with the correct zip extension.
	AllowAnyExtension bool

	// AllowedExtensions specifies a list of extensions that we are allowed
	// to try to decode from. If this is empty, then we allow only the
	// default of zip because allowing no extensions at all would make no
	// sense. If AllowAnyExtension is set, then this has no effect. All the
	// matches are case insensitive.
	AllowedExtensions []string

	// iterators store the list of which iterators we created, so we know
	// which ones we have to close!
	iterators []interfaces.Iterator

	// unlock is a function that should be called as part of the Close
	// method once this resource is finished. It can be defined when
	// building this iterator in case we want a mechanism for the caller of
	// this iterator to tell the child when to unlock any in-use resources.
	// It must be safe to call this function more than once if necessary.
	// This is currently used privately.
	unlock func()
}

// String returns a human-readable representation of the zip path we're looking
// at. The output of this format is not guaranteed to be constant, so don't try
// to parse it.
func (obj *Zip) String() string {
	return fmt.Sprintf("zip: %s", obj.Path)
}

// Validate runs some checks to ensure this iterator was built correctly.
func (obj *Zip) Validate() error {
	if obj.Logf == nil {
		return fmt.Errorf("the Logf function must be specified")
	}
	if err := obj.Prefix.Validate(); err != nil {
		return err
	}

	if obj.Path.Path() == "" {
		return fmt.Errorf("must specify a Path")
	}

	return obj.validateExtension()
}

// validateExtension is a helper function to process our extension validation.
// The decision order matters: AllowAnyExtension wins outright; next, a .zip
// extension is accepted only when no explicit allow-list was given; otherwise
// the path must match one of the AllowedExtensions entries.
func (obj *Zip) validateExtension() error {
	if obj.AllowAnyExtension {
		return nil
	}
	if obj.Path.HasExtInsensitive(ZipExtension) && len(obj.AllowedExtensions) == 0 {
		return nil
	}

	for _, x := range obj.AllowedExtensions {
		if obj.Path.HasExtInsensitive(x) {
			return nil
		}
	}

	// pick the error message that matches how we were configured
	if len(obj.AllowedExtensions) == 0 {
		return fmt.Errorf("the zip extension is required without the allow any extension option")
	}

	return fmt.Errorf("an allowed extension is required to run this iterator")
}

// GetParser returns a handle to the parent parser that built this iterator if
// there is one.
func (obj *Zip) GetParser() interfaces.Parser { return obj.Parser }

// GetIterator returns a handle to the parent iterator that built this iterator
// if there is one.
func (obj *Zip) GetIterator() interfaces.Iterator { return obj.Iterator }

// Recurse runs a simple iterator that is responsible for unzipping a zip URI
// into a local filesystem path. If this happens successfully, it will return a
// new FsIterator that is initialized to this root path.
func (obj *Zip) Recurse(ctx context.Context, scan interfaces.ScanFunc) ([]interfaces.Iterator, error) {
	relDir := safepath.UnsafeParseIntoRelDir("zip/")
	prefix := safepath.JoinToAbsDir(obj.Prefix, relDir)
	if err := os.MkdirAll(prefix.Path(), interfaces.Umask); err != nil {
		return nil, err
	}

	// make a unique ID for the directory
	// XXX: we can consider different algorithms or methods here later...
	now := strconv.FormatInt(time.Now().UnixMilli(), 10) // itoa but int64
	sum := sha256.Sum256([]byte(obj.Path.Path() + now))
	hashRelDir, err := safepath.ParseIntoRelDir(fmt.Sprintf("%x", sum))
	if err != nil {
		return nil, err
	}
	// ensure it gets put into a folder so it doesn't explode current dir
	zipAbsDir := safepath.JoinToAbsDir(prefix, hashRelDir)

	// One mutex per archive path: concurrent unzips of the same file
	// serialize here.
	zipMapMutex.Lock()
	mu, exists := zipMutexes[obj.Path.Path()]
	if !exists {
		mu = &sync.Mutex{}
		zipMutexes[obj.Path.Path()] = mu
	}
	zipMapMutex.Unlock()

	if obj.Debug {
		obj.Logf("locking: %s", obj.String())
	}
	mu.Lock() // locking happens here (unlock on all errors/returns!)
	once := &sync.Once{}
	obj.unlock = func() {
		fn := func() {
			if obj.Debug {
				obj.Logf("unlocking: %s", obj.String())
			}
			mu.Unlock()
		}
		once.Do(fn)
	}

	// XXX: unlock when context closes?

	// XXX: If the destination dir has contents, consider removing them
	// first. This is one reason why we have a mutex.

	// Open the zip archive for reading.
	// FIXME: use a variant that can take a context
	z, err := zip.OpenReader(obj.Path.Path())
	if err == zip.ErrFormat || err == zip.ErrAlgorithm || err == zip.ErrChecksum {
		obj.unlock()
		// Return an "iterator error" instead! This is a magic error
		// that tells the caller that we don't want to nuke the entire
		// scan for one unimportant error! Instead we bubble up and
		// collect this information to return to the user.
		return nil, &interfaces.IteratorError{
			Path: obj.Path.Path(),
			Err:  err,
		}

	} else if err != nil {
		obj.unlock()
		return nil, errwrap.Wrapf(err, "error opening path %s", obj.Path)
	}
	defer z.Close()
	if z.Comment != "" {
		obj.Logf("zip has comment: %s", z.Comment)
	}

	filesTotal := 0
	bytesTotal := int64(0)
	// Iterate through the files in the archive.
	// XXX: can a child directory appear before a parent?
	// TODO: add a recurring progress logf if it takes longer than 30 sec
	for _, x := range z.File {
		// In an effort to short-circuit things if needed, we run a
		// check ourselves and break out early if we see that we have
		// cancelled early.
		select {
		case <-ctx.Done():
			obj.unlock()
			return nil, errwrap.Wrapf(ctx.Err(), "ended unzipping early")
		default:
		}

		// TODO: obj.Debug ?
		obj.Logf("zip: %s", x.Name)

		if x.FileInfo().IsDir() {
			relDir, err := safepath.ParseIntoRelDir(x.Name)
			if err != nil {
				// programming error
				obj.unlock()
				return nil, err
			}

			// this is where the new dir will be created
			absDir := safepath.JoinToAbsDir(zipAbsDir, relDir)

			// XXX: sanity check (is output in the dir?)
			// TODO: we could add this, but safepath automatically does this
			// if absDir is not inside of zipAbsDir then error

			// XXX: which mode method?
			//if err := os.MkdirAll(absDir.Path(), x.Mode()); err != nil {
			if err := os.MkdirAll(absDir.Path(), os.ModePerm); err != nil {
				// programming error
				obj.unlock()
				return nil, err
			}

			continue
		}

		relFile, err := safepath.ParseIntoRelFile(x.Name)
		if err != nil {
			// programming error
			obj.unlock()
			return nil, err
		}

		// this is where the output file will be stored
		absFile := safepath.JoinToAbsFile(zipAbsDir, relFile)

		// XXX: sanity check (is output in the dir?)
		// TODO: we could add this, but safepath automatically does this
		// if absFile is not inside of zipAbsDir then error

		absDir := absFile.Dir() // get the absDir that absFile is in

		// XXX: which mode to use? Maybe we are assuming a mode here
		// because we haven't seen that dir yet! Maybe if we pre-sort
		// all of the zip file entries first...
		//if err := os.MkdirAll(absDir.Path(), x.Mode()); err != nil {
		if err := os.MkdirAll(absDir.Path(), os.ModePerm); err != nil {
			// programming error
			obj.unlock()
			return nil, err
		}

		// write to this location
		// XXX: which mode method?
		//dest, err := os.OpenFile(absFile.Path(), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, x.Mode())
		dest, err := os.OpenFile(absFile.Path(), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, os.ModePerm)
		if err != nil {
			obj.unlock()
			return nil, errwrap.Wrapf(err, "error writing our file to disk at %s", absFile)
		}

		// open the actual source file
		f, err := x.Open()
		if err != nil {
			dest.Close() // close dest file on error!
325 | obj.unlock() 326 | return nil, errwrap.Wrapf(err, "error opening file %s", x.Name) 327 | } 328 | // don't `defer` close here because we want to free in the loop 329 | 330 | // FIXME: use a variant that can take a context 331 | size, err := io.Copy(dest, f) 332 | if err != nil { 333 | f.Close() // close file on error! 334 | dest.Close() // close dest file on error! 335 | obj.unlock() 336 | return nil, errwrap.Wrapf(err, "error writing our file to disk at %s", absFile) 337 | } 338 | obj.Logf("unzipped: %d bytes to disk at %s", size, absFile) 339 | 340 | f.Close() // close on success to save memory! 341 | dest.Close() // close dest file on error! 342 | 343 | filesTotal++ 344 | bytesTotal += int64(size) 345 | } 346 | 347 | // TODO: change to human readable bytes 348 | obj.Logf("unzipped: %d files from %s into %s (%d bytes)", filesTotal, obj.String(), zipAbsDir, bytesTotal) 349 | 350 | obj.iterators = []interfaces.Iterator{} 351 | 352 | // if it's a single zip file we return an fs iterator and let the fs 353 | // iterator sort that out... 354 | iterator := &Fs{ 355 | Debug: obj.Debug, 356 | Logf: func(format string, v ...interface{}) { 357 | obj.Logf(format, v...) // TODO: add a prefix? 358 | }, 359 | Prefix: obj.Prefix, 360 | 361 | Iterator: obj, 362 | 363 | Path: zipAbsDir, 364 | 365 | //Unlock: unlock, 366 | } 367 | obj.iterators = append(obj.iterators, iterator) 368 | 369 | return obj.iterators, nil 370 | } 371 | 372 | // Close shuts down the iterator and/or performs clean up after the Recurse 373 | // method has run. This must be called if you run Recurse. 374 | func (obj *Zip) Close() error { 375 | if obj.unlock != nil { 376 | obj.unlock() 377 | } 378 | var errs error 379 | for i := len(obj.iterators) - 1; i >= 0; i-- { // reverse order (stacks!) 
380 | if err := obj.iterators[i].Close(); err != nil { 381 | errs = errwrap.Append(errs, err) 382 | } 383 | } 384 | return errs 385 | } 386 | -------------------------------------------------------------------------------- /lib/profiles.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | package lib 25 | 26 | import ( 27 | "fmt" 28 | "sort" 29 | "strings" 30 | 31 | "github.com/awslabs/yesiscan/interfaces" 32 | "github.com/awslabs/yesiscan/util" 33 | "github.com/awslabs/yesiscan/util/licenses" 34 | 35 | colour "github.com/fatih/color" 36 | ) 37 | 38 | const ( 39 | // UseColour specifies whether we use ANSI/HTML colours or not. 40 | UseColour = true 41 | 42 | // DefaultProfileName is the name given to the built-in "include all" 43 | // profile. 
	DefaultProfileName = "default"
)

// ProfileConfig is the datastructure representing the profile config that is
// used for the .json files on disk.
type ProfileConfig struct {

	// Licenses is the list of license SPDX ID's to match.
	Licenses []string `json:"licenses"`

	// Exclude these licenses from match instead of including by default.
	Exclude bool `json:"exclude"`

	// Comment adds a user friendly comment for this file.
	Comment string `json:"comment"`
}

// ProfileData is the parsed version of ProfileConfig with real license structs.
type ProfileData struct {

	// Licenses is the list of license SPDX ID's to match.
	Licenses []*licenses.License

	// Exclude these licenses from match instead of including by default.
	Exclude bool
}

// SimpleProfiles is a simple way to filter the results. This is the first
// filter function created and is mostly used for an initial POC. It is the
// more complicated successor to the SimpleResults function. Style can be
// `ansi`, `html`, or `text`.
//
// NOTE(review): several HTML string literals in this function appear to have
// had their markup stripped in this copy of the file (empty ``/"" literals and
// strings broken across lines) — restore them from upstream before relying on
// the "html" style output.
func SimpleProfiles(results interfaces.ResultSet, passes []string, warnings map[string]error, profile *ProfileData, summary bool, backendWeights map[interfaces.Backend]float64, style string) (string, error) {
	// only the three supported render styles are accepted
	if style != "ansi" && style != "html" && style != "text" {
		return "", fmt.Errorf("invalid style: %s", style)
	}

	// redString renders text in "alert" form: bold red for ansi, a span for
	// html (markup looks stripped here), plain passthrough for text.
	redString := func(format string, a ...interface{}) string {
		if style == "ansi" {
			return colour.New(colour.FgRed).Add(colour.Bold).Sprintf(format, a...)
		}
		if style == "html" {
			return `` + fmt.Sprintf(format, a...) + ""
		}
		return fmt.Sprintf(format, a...)
	}
	// boldString is the emphasis variant of redString.
	boldString := func(format string, a ...interface{}) string {
		if style == "ansi" {
			return colour.New(colour.Bold).Sprintf(format, a...)
		}
		if style == "html" {
			return `` + fmt.Sprintf(format, a...) + ""
		}
		return fmt.Sprintf(format, a...)
	}
	str := ""

	// number of skipped files/dirs; highlighted when non-zero
	countStr := fmt.Sprintf("%d", len(passes))
	if len(passes) > 0 {
		countStr = redString(countStr)
	}

	hasResults := false                  // do we have anything to show?
	licenseMap := make(map[string]int64) // for computing a summary
	errorMap := make(map[string]struct {
		backend string
		err     error
	}) // for recording found skip errors
	// XXX: handle dir's in here specially and merge in their weights with child paths!
Loop:
	for uri, m := range results { // FIXME: sort and process properly
		bs := []*AnnotatedBackend{}
		ttl := 0.0      // total weight for the set of backends at this uri
		skipUri := true // assume we skip
		innerLicenseMap := make(map[string]int64)
		// plus bumps the per-uri count for a license name
		plus := func(name string) {
			val, _ := innerLicenseMap[name] // defaults to zero!
			innerLicenseMap[name] = val + 1
		}
		for backend, result := range m {
			if result.Skip != nil {
				// record the skip error for the error report below
				errorMap[uri] = struct {
					backend string
					err     error
				}{
					backend: backend.String(),
					err:     result.Skip,
				}
			}
			// accounting for licenses summary
			for _, x := range result.Licenses {
				plus(x.String())
			}

			if profile == nil {
				// no profile: show everything
				skipUri = false
			} else {
				// TODO: memoize this for performance
				count := len(licenses.Union(profile.Licenses, result.Licenses))
				// are there licenses that match in our profile?
				if count > 0 && !profile.Exclude {
					skipUri = false
				}

				// are there licenses we didn't account for?
				if len(result.Licenses) > count && profile.Exclude {
					skipUri = false
				}
			}

			weight, exists := backendWeights[backend]
			if !exists {
				return "", fmt.Errorf("no weight found for backend: %s", backend.String())
			}
			b := &AnnotatedBackend{
				Backend: backend,
				Weight:  weight,
			}
			bs = append(bs, b)
			ttl += weight
		}
		if skipUri { // we don't want to display this Uri (this file)
			continue Loop
		}
		f := 0.0 // NOTE: confidence *if* the different results agree!
		//for backend, result := range m {
		for _, b := range bs { // for backend, result := range m
			backend := b.Backend
			weight := b.Weight // backendWeights[backend]
			result := m[backend]
			scale := weight / ttl
			b.ScaledConfidence = result.Confidence * scale
			f = f + b.ScaledConfidence
		}

		// merge into to parent accounting
		for k, v := range innerLicenseMap { // map[string]int64
			val, _ := licenseMap[k] // defaults to zero!
			licenseMap[k] = val + v
		}

		// start table row here after the above continue...
		if style == "html" {
			str += ""
		}

		// display highest scaled confidence first
		sort.Sort(sort.Reverse(SortedBackends(bs)))
		smartURI := util.SmartURI(uri) // make it useful to click on
		if style == "ansi" {
			hyperlink := util.ShellHyperlinkEncode(uri, smartURI)
			str += fmt.Sprintf("%s (%.2f%%)\n", hyperlink, f*100.0)
		}
		if style == "html" {
			hyperlink := util.HtmlHyperlinkEncode(uri, smartURI)
			str += fmt.Sprintf("%s (%.2f%%)", hyperlink, f*100.0)
		}
		if style == "text" {
			// TODO: can we do better for text output?
			str += fmt.Sprintf("%s (%.2f%%)\n", uri, f*100.0)
		}
		hasResults = true

		if style == "html" {
			// NOTE(review): literal appears corrupted (markup stripped)
			str += "
    "
		}
		for _, b := range bs { // for backend, result := range m
			backend := b.Backend
			weight := b.Weight // backendWeights[backend]
			result := m[backend]

			l := licenses.Join(result.Licenses)
			if UseColour && profile != nil {
				ll := []string{}
				// only colour the matched ones!
				for _, x := range result.Licenses {
					r := x.String()
					inList := licenses.InList(x, profile.Licenses)
					if inList && !profile.Exclude || !inList && profile.Exclude {
						r = x.String()
						r = redString(r)
					}

					ll = append(ll, r)
				}
				l = strings.Join(ll, ", ")
			}

			s := ""
			if style == "ansi" {
				s = fmt.Sprintf(" %s (%.2f/%.2f) %s (%.2f%%)\n", backend.String(), weight, ttl, l, result.Confidence*100.0)
			}
			if style == "html" {
				// NOTE(review): literal appears corrupted (markup stripped)
				s = fmt.Sprintf("
  • %s (%.2f/%.2f) %s (%.2f%%)
  • ", backend.String(), weight, ttl, l, result.Confidence*100.0)
			}
			if style == "text" {
				s = fmt.Sprintf(" %s (%.2f/%.2f) %s (%.2f%%)\n", backend.String(), weight, ttl, l, result.Confidence*100.0)
			}

			str += s
			hasResults = true
			if !debug {
				continue
			}
			// walk the iterator chain back to its origin for tracing
			it := result.Meta.Iterator // at least one must be present
			for {
				str += fmt.Sprintf(" %s\n", it)
				hasResults = true
				newIt := it.GetIterator()
				if newIt == nil {
					break
				}
				it = newIt
			}
			if parser := it.GetParser(); parser != nil {
				str += fmt.Sprintf(" %s\n", parser)
				hasResults = true
			}
		}
		if style == "html" {
			// NOTE(review): literals appear corrupted (markup stripped)
			str += "
 "
			str += ""
		}
	}

	skippedStr := ""
	if style == "ansi" {
		skippedStr = fmt.Sprintf("skipped: %s files/directories\n", countStr)
	}
	if style == "html" {
		// NOTE(review): literals appear corrupted (markup stripped)
		s := ``
		s += fmt.Sprintf("", countStr)
		s += "
skipped: %s files/directories
"
		skippedStr = s
	}
	if style == "text" {
		// BUG(review): countStr is a string, so %d renders as
		// %!d(string=...); this should be %s like the ansi branch.
		skippedStr = fmt.Sprintf("skipped: %d files/directories\n", countStr)
	}

	erroredStr := ""
	if len(errorMap) > 0 { // keep it in scope
		// sort uris for deterministic output
		names := []string{}
		for k := range errorMap { // map[string]error
			names = append(names, k)
		}
		sort.Strings(names)
		if style == "ansi" || style == "text" {
			s := "errors:\n"
			for _, x := range names {
				s += fmt.Sprintf("%s: %s (%s)\n", x, redString(errorMap[x].err.Error()), errorMap[x].backend)
			}
			erroredStr = s
		}
		if style == "html" {
			// NOTE(review): literals appear corrupted (markup stripped)
			s := ``
			s += `
errors:
`
			for _, x := range names {
				s += fmt.Sprintf("
%s%s (%s)
", x, redString(errorMap[x].err.Error()), errorMap[x].backend)
			}

			s += "
"
			erroredStr = s
		}
	}

	warningStr := ""
	if len(warnings) > 0 { // keep it in scope
		// sort keys for deterministic output
		names := []string{}
		for k := range warnings { // map[string]error
			names = append(names, k)
		}
		sort.Strings(names)
		if style == "ansi" || style == "text" {
			// NOTE(review): heading reads "errors:" but this section
			// renders the warnings map — likely a copy-paste from the
			// errored section above; confirm intent.
			s := "errors:\n"
			for _, x := range names {
				s += fmt.Sprintf("%s: %s\n", x, redString(warnings[x].Error()))
			}
			warningStr = s
		}
		if style == "html" {
			// NOTE(review): literals appear corrupted (markup stripped)
			s := ``
			s += `
errors:
`
			for _, x := range names {
				s += fmt.Sprintf("
%s%s
", x, redString(warnings[x].Error()))
			}

			s += "
"
			warningStr = s
		}
	}

	noResultsStr := ""
	if !hasResults {
		noResultsStr = ""
		if style == "html" {
			// NOTE(review): literals appear corrupted (markup stripped)
			s := ``
			s += "
no results
"
			noResultsStr = s
		}
	}

	summaryStr := ""
	if summary {
		// sort license names for deterministic output
		names := []string{}
		for k := range licenseMap { // map[string]int64
			names = append(names, k)
		}
		sort.Strings(names)
		if style == "ansi" || style == "text" {
			s := boldString("summary:") + "\n"
			for _, x := range names {
				s += fmt.Sprintf("%s: %d\n", x, licenseMap[x])
			}
			summaryStr = s
		}
		if style == "html" {
			// NOTE(review): literals appear corrupted (markup stripped)
			s := ``
			s += fmt.Sprintf(`
%s
`, boldString("summary:"))
			for _, x := range names {
				s += fmt.Sprintf("
%s%d
", x, licenseMap[x])
			}

			s += "
"
			summaryStr = s
		}
	}

	if !hasResults {
		summaryStr = ""
	}
	// glue it all together
	str = skippedStr + warningStr + erroredStr + summaryStr + noResultsStr + str

	return str, nil
}
-------------------------------------------------------------------------------- /lib/results.go: --------------------------------------------------------------------------------
// Copyright Amazon.com Inc or its affiliates and the project contributors
// Written by James Shubin and the project contributors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.
//
// We will never require a CLA to submit a patch. All contributions follow the
// `inbound == outbound` rule.
//
// This is not an official Amazon product. Amazon does not offer support for
// this project.
21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | package lib 25 | 26 | import ( 27 | "fmt" 28 | "sort" 29 | 30 | "github.com/awslabs/yesiscan/interfaces" 31 | "github.com/awslabs/yesiscan/util" 32 | "github.com/awslabs/yesiscan/util/licenses" 33 | ) 34 | 35 | const ( 36 | debug = false 37 | ) 38 | 39 | type AnnotatedBackend struct { 40 | Backend interfaces.Backend 41 | Weight float64 42 | ScaledConfidence float64 43 | } 44 | 45 | type SortedBackends []*AnnotatedBackend 46 | 47 | func (obj SortedBackends) Len() int { return len(obj) } 48 | func (obj SortedBackends) Swap(i, j int) { obj[i], obj[j] = obj[j], obj[i] } 49 | func (obj SortedBackends) Less(i, j int) bool { 50 | return obj[i].ScaledConfidence < obj[j].ScaledConfidence 51 | } 52 | 53 | //func (obj SortedBackends) Sort() { sort.Sort(obj) } 54 | 55 | // SimpleResults is a simple way to format the results. This is the first 56 | // display function created and is mostly used for debugging and initial POC. 57 | func SimpleResults(results interfaces.ResultSet, backendWeights map[interfaces.Backend]float64) (string, error) { 58 | if len(results) == 0 { 59 | return "", fmt.Errorf("no results obtained") 60 | } 61 | 62 | str := "" 63 | // XXX: handle dir's in here specially and merge in their weights with child paths! 64 | for uri, m := range results { // FIXME: sort and process properly 65 | bs := []*AnnotatedBackend{} 66 | ttl := 0.0 // total weight for the set of backends at this uri 67 | for backend := range m { 68 | weight, exists := backendWeights[backend] 69 | if !exists { 70 | return "", fmt.Errorf("no weight found for backend: %s", backend.String()) 71 | } 72 | b := &AnnotatedBackend{ 73 | Backend: backend, 74 | Weight: weight, 75 | } 76 | bs = append(bs, b) 77 | ttl += weight 78 | } 79 | f := 0.0 // NOTE: confidence *if* the different results agree! 
80 | //for backend, result := range m { 81 | for _, b := range bs { // for backend, result := range m 82 | backend := b.Backend 83 | weight := b.Weight // backendWeights[backend] 84 | result := m[backend] 85 | scale := weight / ttl 86 | b.ScaledConfidence = result.Confidence * scale 87 | f = f + b.ScaledConfidence 88 | } 89 | 90 | sort.Sort(sort.Reverse(SortedBackends(bs))) 91 | display := uri // show the URI 92 | smartURI := util.SmartURI(uri) 93 | hyperlink := util.ShellHyperlinkEncode(display, smartURI) 94 | str += fmt.Sprintf("%s (%.2f%%)\n", hyperlink, f*100.0) 95 | for _, b := range bs { // for backend, result := range m 96 | backend := b.Backend 97 | weight := b.Weight // backendWeights[backend] 98 | result := m[backend] 99 | l := licenses.Join(result.Licenses) 100 | str += fmt.Sprintf(" %s (%.2f/%.2f) %s (%.2f%%)\n", backend.String(), weight, ttl, l, result.Confidence*100.0) 101 | if !debug { 102 | continue 103 | } 104 | it := result.Meta.Iterator // at least one must be present 105 | for { 106 | str += fmt.Sprintf(" %s\n", it) 107 | newIt := it.GetIterator() 108 | if newIt == nil { 109 | break 110 | } 111 | it = newIt 112 | } 113 | if parser := it.GetParser(); parser != nil { 114 | str += fmt.Sprintf(" %s\n", parser) 115 | } 116 | } 117 | } 118 | return str, nil 119 | } 120 | -------------------------------------------------------------------------------- /parser/parser.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. 
You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | package parser 25 | 26 | import ( 27 | "fmt" 28 | "net/url" 29 | "os" 30 | "path/filepath" 31 | "strings" 32 | 33 | "github.com/awslabs/yesiscan/interfaces" 34 | "github.com/awslabs/yesiscan/iterator" 35 | "github.com/awslabs/yesiscan/util/errwrap" 36 | "github.com/awslabs/yesiscan/util/safepath" 37 | "github.com/go-git/go-git/v5/plumbing" 38 | ) 39 | 40 | // TrivialURIParser takes input as a single string. It expects either a URL or a 41 | // Path component as the input. 42 | type TrivialURIParser struct { 43 | Debug bool 44 | Logf func(format string, v ...interface{}) 45 | Prefix safepath.AbsDir 46 | 47 | Input string 48 | } 49 | 50 | func (obj *TrivialURIParser) String() string { 51 | return fmt.Sprintf("trivialuriparser(%s)", obj.Input) 52 | } 53 | 54 | func (obj *TrivialURIParser) Parse() ([]interfaces.Iterator, error) { 55 | if obj.Input == "" { 56 | return nil, fmt.Errorf("empty input") 57 | } 58 | 59 | iterators := []interfaces.Iterator{} 60 | 61 | // NOTE: it's unlikely that the url.Parse method ever errors. 
62 | u, err := url.Parse(obj.Input) 63 | if err != nil { 64 | return nil, errwrap.Wrapf(err, "could not parse URL") 65 | } 66 | s := u.String() 67 | 68 | if obj.Debug { 69 | obj.Logf("scheme: %s", u.Scheme) 70 | obj.Logf("host: %s", u.Host) 71 | obj.Logf("path: %s", u.Path) 72 | } 73 | 74 | // TODO: consider allowing HttpSchemeRaw as well (with a flag) 75 | if strings.ToLower(u.Scheme) == iterator.HttpSchemeRaw { 76 | return nil, fmt.Errorf("plain http is currently blocked, did you mean https?") 77 | } 78 | 79 | // this is a bit of a heuristic, but we'll go with it for now 80 | // this is because we get https:// urls that are really github git URI's 81 | isTar := strings.HasSuffix(strings.ToLower(s), iterator.TarExtension) 82 | if strings.ToLower(u.Scheme) == iterator.HttpsSchemeRaw && (isZip(s) || isGzip(s) || isTar || isBzip2(s)) { 83 | iterator := &iterator.Http{ 84 | Debug: obj.Debug, 85 | Logf: func(format string, v ...interface{}) { 86 | obj.Logf("iterator: "+format, v...) 87 | }, 88 | Prefix: obj.Prefix, 89 | URL: s, // TODO: pass a *net.URL instead? 90 | AllowHttp: false, // allow non-https ? 91 | 92 | Parser: obj, // store a handle to the originator 93 | } 94 | iterators = append(iterators, iterator) 95 | return iterators, nil 96 | } 97 | 98 | if isGit(u) { 99 | // TODO: for now, just assume it can only be a git iterator... 100 | // Checking if commit hash exists at the end of the URL. 101 | // examples of URLs of different hosts containing commit hashes: 102 | // github: https://github.com/awslabs/yesiscan/commit/496d080bc7fe835511d7220f127e118d0881b792 103 | // webrtc: https://webrtc.googlesource.com/src.git/+/c276aee4eda7b1a466b139838f20e790bd746309 104 | // TODO: Might need to be generalized in the future as we add more URL patterns. 
105 | hash := "" 106 | index := strings.LastIndex(u.Path, "/") 107 | pathSuffix := u.Path[index+1:] 108 | if plumbing.IsHash(pathSuffix) { 109 | hash = pathSuffix 110 | // Here we are removing the parts of the URL which are there because 111 | // of a commit hash such that the repository can be cloned properly. 112 | u.Path = u.Path[:index] 113 | index := strings.LastIndex(u.Path, "/") 114 | u.Path = u.Path[:index] 115 | s = u.String() 116 | } 117 | iterator := &iterator.Git{ 118 | Debug: obj.Debug, 119 | Logf: func(format string, v ...interface{}) { 120 | obj.Logf("iterator: "+format, v...) 121 | }, 122 | Prefix: obj.Prefix, 123 | URL: s, // TODO: pass a *net.URL instead? 124 | TrimGitSuffix: true, 125 | Hash: hash, 126 | Parser: obj, // store a handle to the originator 127 | } 128 | iterators = append(iterators, iterator) 129 | return iterators, nil 130 | } 131 | 132 | // path component (absolute or relative, file or dir) 133 | if u.Scheme == "" { 134 | // XXX: we could auto-detect the dir bit 135 | isDir := strings.HasSuffix(obj.Input, "/") 136 | info, err := os.Stat(obj.Input) // XXX: stat or Lstat? 137 | if err != nil { 138 | return nil, err 139 | } 140 | if isDir != info.IsDir() { 141 | return nil, fmt.Errorf("input path must end with a trailing slash if it's a dir") 142 | } 143 | 144 | p, err := filepath.Abs(obj.Input) 145 | if err != nil { 146 | return nil, err 147 | } 148 | if isDir { 149 | p += "/" // filepath.Abs calls filepath.Clean which strips this 150 | } 151 | 152 | path, err := safepath.ParseIntoPath(p, isDir) 153 | if err != nil { 154 | return nil, err 155 | } 156 | iterator := &iterator.Fs{ 157 | Debug: obj.Debug, 158 | Logf: func(format string, v ...interface{}) { 159 | obj.Logf("iterator: "+format, v...) 
160 | }, 161 | Prefix: obj.Prefix, 162 | Path: path, 163 | 164 | Parser: obj, // store a handle to the originator 165 | } 166 | iterators = append(iterators, iterator) 167 | return iterators, nil 168 | } 169 | 170 | obj.Logf("i'm not sure how to parse this URI, please report this if you think I should be able to!") 171 | return nil, fmt.Errorf("i'm not sure how to parse this uri") 172 | } 173 | 174 | // isGit is a small helper to decide if we should run the git iterator or not. 175 | // TODO: we should expand this function as it's a heuristic. maybe we can do 176 | // better overall and not need a heuristic. time will tell... 177 | func isGit(u *url.URL) bool { 178 | if strings.ToLower(u.Scheme) == iterator.GitSchemeRaw { 179 | return true 180 | } 181 | if strings.ToLower(u.Scheme) == iterator.HttpsSchemeRaw { 182 | hosts := []string{"github.com", "webrtc.googlesource.com"} 183 | urlHost := strings.ToLower(u.Host) 184 | for _, host := range hosts { 185 | if urlHost == host { 186 | return true 187 | } 188 | } 189 | } 190 | 191 | return false 192 | } 193 | 194 | // isZip is a helper method to determine whether a string has a Zip extension 195 | // suffix. 196 | func isZip(input string) bool { 197 | extensions := []string{iterator.ZipExtension, iterator.JarExtension, iterator.WhlExtension} 198 | for _, extension := range extensions { 199 | if strings.HasSuffix(strings.ToLower(input), extension) { 200 | return true 201 | } 202 | } 203 | return false 204 | } 205 | 206 | // isGzip is a helper method to determine whether a string has a Gzip extension 207 | // suffix. 208 | func isGzip(input string) bool { 209 | for _, extension := range iterator.GzipExtensions { 210 | if strings.HasSuffix(strings.ToLower(input), extension) { 211 | return true 212 | } 213 | } 214 | return false 215 | } 216 | 217 | // isBzip2 is a helper method to determine whether a string has a Bzip2 218 | // extension suffix. 
219 | func isBzip2(input string) bool { 220 | for _, extension := range iterator.Bzip2Extensions { 221 | if strings.HasSuffix(strings.ToLower(input), extension) { 222 | return true 223 | } 224 | } 225 | return false 226 | } 227 | -------------------------------------------------------------------------------- /s3/s3.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | package s3 25 | 26 | import ( 27 | "bytes" 28 | "context" 29 | "crypto/md5" 30 | "encoding/base64" 31 | "errors" 32 | "fmt" 33 | "io" 34 | "time" 35 | 36 | "github.com/awslabs/yesiscan/util/errwrap" 37 | 38 | "github.com/aws/aws-sdk-go-v2/aws" 39 | s3config "github.com/aws/aws-sdk-go-v2/config" 40 | "github.com/aws/aws-sdk-go-v2/service/s3" 41 | s3types "github.com/aws/aws-sdk-go-v2/service/s3/types" 42 | ) 43 | 44 | const ( 45 | // GrantReadAllUsers is the constant used to give read access to all. 
46 | GrantReadAllUsers = "uri=http://acs.amazonaws.com/groups/global/AllUsers" 47 | 48 | // DefaultRegion is a region to use if none are specified. 49 | DefaultRegion = "ca-central-1" // yul 50 | ) 51 | 52 | // PubURL returns the public URL for an object in a given region and bucket. 53 | // This depends on you setting the appropriate permissions and choosing valid 54 | // input parameters. No validation is done, this is just templating. 55 | func PubURL(region, bucket, object string) string { 56 | return fmt.Sprintf("https://%s.s3.%s.amazonaws.com/%s", bucket, region, object) 57 | } 58 | 59 | // Inputs is the set of information required to use the Store method. 60 | type Inputs struct { 61 | // Region is the region where we will push the data. 62 | Region string 63 | 64 | // BucketName is the name of the bucket. 65 | BucketName string 66 | 67 | // CreateBucket is true if we wish to create the bucket if it's missing. 68 | CreateBucket bool 69 | 70 | // ObjectName is the name of the object. 71 | ObjectName string 72 | 73 | // GrantReadAllUsers specifies that all users read access will be set on 74 | // this object. Only use this if you are certain you want anyone on the 75 | // internet to be able to read this object. 76 | GrantReadAllUsers bool 77 | 78 | // ContentType is what is set for the object if it is non-nil. 79 | ContentType *string 80 | 81 | // Data is the actual data to store. 82 | Data []byte 83 | 84 | Debug bool 85 | Logf func(format string, v ...interface{}) 86 | } 87 | 88 | // Store takes some inputs and stores the data into s3. If successful, it 89 | // returns a presign URL that can be shared to give access to the object. If you 90 | // chose to make the object public, then it can also be accessed using the 91 | // well-known public URL as obtained by the PubURL function. This depends on you 92 | // having appropriate AWS credentials set up on your machine for the account you 93 | // want to use. 
94 | func Store(ctx context.Context, inputs *Inputs) (string, error) { 95 | if inputs.Debug { 96 | inputs.Logf("begin s3...") 97 | defer inputs.Logf("done s3") 98 | } 99 | 100 | // TODO: check if region is valid? 101 | if inputs.Region == "" { 102 | return "", fmt.Errorf("empty region") 103 | } 104 | 105 | cfg, err := s3config.LoadDefaultConfig(ctx, s3config.WithRegion(inputs.Region)) 106 | if err != nil { 107 | return "", errwrap.Wrapf(err, "config error") 108 | } 109 | cfg.Region = inputs.Region 110 | client := s3.NewFromConfig(cfg) 111 | 112 | if inputs.CreateBucket { 113 | if inputs.Debug { 114 | inputs.Logf("creating bucket...") 115 | } 116 | createBucketInput := &s3.CreateBucketInput{ 117 | Bucket: &inputs.BucketName, 118 | 119 | // The configuration information for the bucket. 120 | CreateBucketConfiguration: &s3types.CreateBucketConfiguration{ 121 | // Specifies the Region where the bucket will be 122 | // created. If you don't specify a Region, the 123 | // bucket is created in the US East 124 | // (N. Virginia) Region (us-east-1). 125 | //LocationConstraint: s3types.BucketLocationConstraintCaCentral1, 126 | // it's a string region 127 | LocationConstraint: s3types.BucketLocationConstraint(inputs.Region), 128 | }, 129 | } 130 | 131 | _, err := client.CreateBucket(ctx, createBucketInput) 132 | //*CreateBucketOutput 133 | if err == nil { 134 | inputs.Logf("bucket created") 135 | } 136 | 137 | // ignore the error if it shows bucket already exists 138 | var bucketErr error 139 | for err != nil { 140 | bucketErr = err // we have an error! 141 | if _, ok := err.(*s3types.BucketAlreadyOwnedByYou); ok { 142 | bucketErr = nil // ignore me! 
143 | break 144 | } 145 | err = errors.Unwrap(err) 146 | } 147 | if bucketErr != nil { 148 | return "", errwrap.Wrapf(bucketErr, "bucket creation issue") 149 | } 150 | if inputs.Debug { 151 | inputs.Logf("bucket should exist") 152 | } 153 | } 154 | 155 | body := bytes.NewReader(inputs.Data) // support seek 156 | 157 | // we hash this to make idempotent puts avoid copying the data again... 158 | h := md5.New() 159 | if _, err := io.Copy(h, body); err != nil { 160 | return "", errwrap.Wrapf(err, "copy to hash error") 161 | } 162 | // rewind after hashing 163 | if _, err := body.Seek(0, io.SeekStart); err != nil { 164 | return "", errwrap.Wrapf(err, "seek error") 165 | } 166 | 167 | md5s := base64.StdEncoding.EncodeToString(h.Sum(nil)) 168 | if inputs.Debug { 169 | inputs.Logf("md5s: %s", md5s) 170 | } 171 | 172 | putObjectInput := &s3.PutObjectInput{ 173 | Bucket: &inputs.BucketName, // this member is required 174 | 175 | Key: &inputs.ObjectName, // this member is required 176 | 177 | // For using values that are not seekable (io.Seeker) see, 178 | // https://aws.github.io/aws-sdk-go-v2/docs/sdk-utilisties/s3/#unseekable-streaming-input 179 | Body: body, // io.Reader 180 | 181 | ContentMD5: &md5s, 182 | 183 | ContentType: inputs.ContentType, 184 | 185 | StorageClass: s3types.StorageClassStandard, 186 | } 187 | if inputs.GrantReadAllUsers { // give all users on internet read access! 188 | putObjectInput.GrantRead = aws.String(GrantReadAllUsers) 189 | } 190 | 191 | inputs.Logf("putting object...") 192 | if _, err := client.PutObject(ctx, putObjectInput); err != nil { 193 | return "", errwrap.Wrapf(err, "put error") 194 | } 195 | 196 | // X-Amz-Expires must be less than a week (in seconds); that is, the 197 | // given X-Amz-Expires must be less than 604800 seconds. (equal is okay) 198 | // TODO: i suppose we could allow the user to specify the expiry time, 199 | // but the maximum is so short, we'll hardcode this in here for now. 
200 | presignClient := s3.NewPresignClient(client, s3.WithPresignExpires(7*24*time.Hour)) 201 | 202 | presignResult, err := presignClient.PresignGetObject(ctx, &s3.GetObjectInput{ 203 | Bucket: aws.String(inputs.BucketName), 204 | Key: aws.String(inputs.ObjectName), 205 | }) 206 | 207 | if err != nil { 208 | return "", errwrap.Wrapf(err, "presign error") 209 | } 210 | 211 | return presignResult.URL, nil 212 | } 213 | -------------------------------------------------------------------------------- /s3/screenshot-s3-public-bucket.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/purpleidea/yesiscan/a9f8980f4f152aa15610bec090c7c6503db8ee6f/s3/screenshot-s3-public-bucket.png -------------------------------------------------------------------------------- /util/ansi/ansi.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 
21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | package ansi 25 | 26 | import ( 27 | "fmt" 28 | "os" 29 | "strings" 30 | "sync" 31 | 32 | "golang.org/x/term" 33 | ) 34 | 35 | // Logf is a complex printing thing to do some ansi terminal escape sequence 36 | // magic. 37 | // FIXME: there might be bugs if Ellipsis is very big and Width is very small. 38 | type Logf struct { 39 | // Prefix is a prefix to append to each message. You can leave this 40 | // empty. 41 | Prefix string 42 | 43 | // Ellipsis is what is appended to the end of each message when 44 | // truncating. You can leave this empty. 45 | Ellipsis string 46 | 47 | // Enable specifies whether you want to turn this on or not. 48 | Enable bool 49 | 50 | // Prefixes are a list of string prefixes to match when deciding to 51 | // delete a previous entry. 52 | Prefixes []string 53 | 54 | mutex *sync.Mutex 55 | previous string 56 | isTerminal bool 57 | width int 58 | } 59 | 60 | // Init must be called once before Logf is used. As a convenience, this returns 61 | // the Logf function that you should use! 62 | func (obj *Logf) Init() func(format string, v ...interface{}) { 63 | obj.mutex = &sync.Mutex{} 64 | //obj.previous = "" 65 | obj.isTerminal = term.IsTerminal(0) 66 | var err error 67 | obj.width, _, err = term.GetSize(0) 68 | if err != nil { 69 | obj.isTerminal = false // keep it simple, who cares 70 | } 71 | 72 | return obj.Logf 73 | } 74 | 75 | // Logf is the actual Logf function you should use. You must run Init before 76 | // you use this. 77 | func (obj *Logf) Logf(format string, v ...interface{}) { 78 | s := fmt.Sprintf(format, v...) 79 | 80 | if obj.isTerminal { 81 | // TODO: what about multi-char width UTF-8 stuff? 
82 | if len(s) > obj.width-len(obj.Prefix) { // truncate/ellipsize 83 | s = s[0:obj.width-len(obj.Prefix)-len(obj.Ellipsis)] + obj.Ellipsis 84 | } 85 | } 86 | s = s + "\n" // add the newline in 87 | 88 | obj.mutex.Lock() // for safety 89 | validPrefix := false 90 | for _, p := range obj.Prefixes { 91 | b := strings.HasPrefix(obj.previous, p) 92 | validPrefix = validPrefix || b 93 | } 94 | 95 | if obj.Enable && obj.previous != "" && validPrefix { 96 | // move up 1 line, clear to left 97 | fmt.Fprint(os.Stderr, "\033[1A\033[K") // not 1K as you'd think 98 | } 99 | fmt.Fprint(os.Stderr, obj.Prefix+s) // actually print 100 | 101 | obj.previous = s // save for later 102 | obj.mutex.Unlock() 103 | } 104 | -------------------------------------------------------------------------------- /util/errwrap/errwrap.go: -------------------------------------------------------------------------------- 1 | // Mgmt 2 | // Copyright (C) 2013-2021+ James Shubin and the project contributors 3 | // Written by James Shubin and the project contributors 4 | // 5 | // This program is free software: you can redistribute it and/or modify 6 | // it under the terms of the GNU General Public License as published by 7 | // the Free Software Foundation, either version 3 of the License, or 8 | // (at your option) any later version. 9 | // 10 | // This program is distributed in the hope that it will be useful, 11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | // GNU General Public License for more details. 14 | // 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program. If not, see . 17 | 18 | // NOTE: This was copied from https://github.com/purpleidea/mgmt/ but the author 19 | // has allowed it to be distributed as LGPL-3.0+ for easier use in this project. 
20 | // SPDX-License-Identifier: LGPL-3.0-linking-exception 21 | 22 | // Package errwrap contains some error helpers. 23 | package errwrap 24 | 25 | import ( 26 | "github.com/hashicorp/go-multierror" 27 | "github.com/pkg/errors" 28 | ) 29 | 30 | // Wrapf adds a new error onto an existing chain of errors. If the new error to 31 | // be added is nil, then the old error is returned unchanged. 32 | func Wrapf(err error, format string, args ...interface{}) error { 33 | return errors.Wrapf(err, format, args...) 34 | } 35 | 36 | // Append can be used to safely append an error onto an existing one. If you 37 | // pass in a nil error to append, the existing error will be returned unchanged. 38 | // If the existing error is already nil, then the new error will be returned 39 | // unchanged. This makes it easy to use Append as a safe `reterr += err`, when 40 | // you don't know if either is nil or not. 41 | func Append(reterr, err error) error { 42 | if reterr == nil { // keep it simple, pass it through 43 | return err // which might even be nil 44 | } 45 | if err == nil { // no error, so don't do anything 46 | return reterr 47 | } 48 | // both are real errors 49 | return multierror.Append(reterr, err) 50 | } 51 | 52 | // String returns a string representation of the error. In particular, if the 53 | // error is nil, it returns an empty string instead of panicing. 54 | func String(err error) string { 55 | if err == nil { 56 | return "" 57 | } 58 | return err.Error() 59 | } 60 | 61 | // Cause returns the top-most error that we can print directly to the end-user. 
// Cause returns the top-most error that we can print directly to the end-user.
// It delegates to the pkg/errors Cause helper, which unwraps the chain built
// by Wrapf down to the original root error.
func Cause(err error) error {
	return errors.Cause(err)
}
30 | package licenses 31 | 32 | import ( 33 | "bytes" 34 | "embed" 35 | "encoding/json" 36 | "fmt" 37 | "strings" 38 | "sync" 39 | ) 40 | 41 | // licensesJson is populated automatically at build-time from the official spdx 42 | // licenses.json file, which is linked into this repository as a git submodule. 43 | // 44 | //go:embed license-list-data/json/licenses.json 45 | var licensesJSON []byte 46 | 47 | //go:embed license-list-data/json/details/*.json 48 | var licensesTextJSON embed.FS 49 | 50 | //go:embed license-list-data/json/exceptions.json 51 | var exceptionsJson []byte 52 | 53 | //go:embed license-list-data/json/exceptions/*.json 54 | var exceptionsTextJSON embed.FS 55 | 56 | var ( 57 | once sync.Once 58 | LicenseList LicenseListSPDX // this gets populated during init() 59 | ) 60 | 61 | func init() { 62 | once.Do(decode) 63 | } 64 | 65 | // TODO: import the exceptions if we ever decide we want to look at those. 66 | func decode() { 67 | buffer := bytes.NewBuffer(licensesJSON) 68 | decoder := json.NewDecoder(buffer) 69 | if err := decoder.Decode(&LicenseList); err != nil { 70 | panic(fmt.Sprintf("error decoding spdx license list: %+v", err)) 71 | } 72 | if len(LicenseList.Licenses) == 0 { 73 | panic(fmt.Sprintf("could not find any licenses to decode")) 74 | } 75 | 76 | // debug 77 | //dirEntry, err := licensesTextJSON.ReadDir("license-list-data/json/details") 78 | //if err != nil { 79 | // panic(fmt.Sprintf("error: %+v", err)) 80 | //} 81 | //for _, x := range dirEntry { 82 | // fmt.Printf("Name: %+v\n", x.Name()) 83 | //} 84 | 85 | for _, license := range LicenseList.Licenses { 86 | //fmt.Printf("ID: %+v\n", license.LicenseID) // debug 87 | 88 | f := "license-list-data/json/details/" + strings.TrimPrefix(license.Reference, "./") 89 | data, err := licensesTextJSON.ReadFile(f) 90 | if err != nil { 91 | panic(fmt.Sprintf("error reading spdx license file: %s, error: %+v", f, err)) 92 | } 93 | //fmt.Printf("Data: %s\n", string(data)) // debug 94 | buffer := 
bytes.NewBuffer(data) 95 | decoder := json.NewDecoder(buffer) 96 | 97 | if err := decoder.Decode(&license); err != nil { 98 | panic(fmt.Sprintf("error decoding spdx license text: %+v", err)) 99 | } 100 | //fmt.Printf("Text: %+v\n", license.Text) // debug 101 | if license.Text == "" { 102 | panic(fmt.Sprintf("could not find any license text for: %s", license.LicenseID)) 103 | } 104 | } 105 | } 106 | 107 | // LicenseListSPDX is modelled after the official SPDX licenses.json file. 108 | type LicenseListSPDX struct { 109 | Version string `json:"licenseListVersion"` 110 | 111 | Licenses []*LicenseSPDX `json:"licenses"` 112 | } 113 | 114 | // LicenseSPDX is modelled after the official SPDX license entries. It also 115 | // includes fields from the referenced fields, which include the full text. 116 | type LicenseSPDX struct { 117 | // Reference is a link to the full license .json file. 118 | Reference string `json:"reference"` 119 | IsDeprecated bool `json:"isDeprecatedLicenseId"` 120 | DetailsURL string `json:"detailsUrl"` 121 | // ReferenceNumber is an index number for the license. I wouldn't 122 | // consider this to be stable over time. 123 | ReferenceNumber int64 `json:"referenceNumber"` 124 | // Name is a friendly name for the license. 125 | Name string `json:"name"` 126 | // LicenseID is the SPDX ID for the license. 127 | LicenseID string `json:"licenseId"` 128 | SeeAlso []string `json:"seeAlso"` 129 | IsOSIApproved bool `json:"isOsiApproved"` 130 | 131 | //IsDeprecated bool `json:"isDeprecatedLicenseId"` // appears again 132 | IsFSFLibre bool `json:"isFsfLibre"` 133 | Text string `json:"licenseText"` 134 | } 135 | 136 | // License is a representation of a license. It's better than a simple SPDX ID 137 | // as a string, because it allows us to store alternative representations to an 138 | // internal or different representation, as well as any other information that 139 | // we want to have associated here. 
140 | type License struct { 141 | // SPDX is the well-known SPDX ID for the license. 142 | SPDX string 143 | 144 | // Origin shows a different license provenance, and associated custom 145 | // name. It should probably be a "reverse-dns" style unique identifier. 146 | Origin string 147 | // Custom is a custom string that is a unique identifier for the license 148 | // in the aforementioned Origin namespace. 149 | Custom string 150 | } 151 | 152 | // String returns a string representation of whatever license is specified. 153 | func (obj *License) String() string { 154 | if obj.Origin != "" && obj.Custom != "" { 155 | return fmt.Sprintf("%s(%s)", obj.Custom, obj.Origin) 156 | } 157 | 158 | if obj.Origin == "" && obj.Custom != "" { 159 | return fmt.Sprintf("%s(unknown)", obj.Custom) // TODO: display this differently? 160 | } 161 | 162 | // TODO: replace with a different short name if one exists 163 | return obj.SPDX 164 | } 165 | 166 | // Validate returns an error if the license doesn't have a valid representation. 167 | // For example, if you express the license as an SPDX ID, this will validate 168 | // that it is among the known licenses. 169 | func (obj *License) Validate() error { 170 | if obj.SPDX != "" { 171 | // if an SPDX ID is specified, we validate based on it! 172 | _, err := ID(obj.SPDX) 173 | return err 174 | } 175 | 176 | // valid, but from an unknown origin 177 | if obj.Origin != "" && obj.Custom != "" { 178 | return nil 179 | } 180 | 181 | if obj.Origin == "" && obj.Custom != "" { 182 | return fmt.Errorf("unknown custom license: %s", obj.Custom) 183 | } 184 | 185 | return fmt.Errorf("unknown license format") 186 | } 187 | 188 | // Cmp compares two licenses and determines if they are identical. 
189 | func (obj *License) Cmp(license *License) error { 190 | if obj.SPDX != license.SPDX { 191 | return fmt.Errorf("the SPDX field differs") 192 | } 193 | if obj.Origin != license.Origin { 194 | return fmt.Errorf("the Origin field differs") 195 | } 196 | if obj.Custom != license.Custom { 197 | return fmt.Errorf("the Custom field differs") 198 | } 199 | 200 | return nil 201 | } 202 | 203 | // ID looks up the license from the imported list. Do not modify the result as 204 | // it is the global database that everyone is using. 205 | func ID(spdx string) (*LicenseSPDX, error) { 206 | for _, license := range LicenseList.Licenses { 207 | if spdx == license.LicenseID { 208 | return license, nil 209 | } 210 | } 211 | return nil, fmt.Errorf("license ID (%s) not found", spdx) 212 | } 213 | 214 | // StringToLicense takes an input string and returns a license struct. This can 215 | // handle both normal SPDX ID's and the origin strings in the `name(origin)` 216 | // format. It rarely returns an error unless you pass it an obviously fake 217 | // license identifier. 218 | // TODO: add some tests 219 | func StringToLicense(name string) (*License, error) { 220 | license := &License{ 221 | SPDX: name, 222 | } 223 | 224 | if err := license.Validate(); err == nil { 225 | return license, nil 226 | } 227 | 228 | // assume this for now... 
229 | license = &License{ 230 | //SPDX: "", 231 | Origin: "", // unknown 232 | Custom: name, 233 | } 234 | 235 | // parse the licenseName(origin) syntax 236 | ix := strings.Index(name, "(") 237 | if ix > -1 && strings.HasSuffix(name, ")") && (ix+1) < (len(name)-1) { 238 | license = &License{ 239 | //SPDX: "", 240 | Origin: name[ix+1 : len(name)-1], 241 | Custom: name[0:ix], 242 | } 243 | } 244 | 245 | lhs := strings.Count(name, "(") 246 | rhs := strings.Count(name, ")") 247 | if lhs != rhs { 248 | return nil, fmt.Errorf("unbalanced parenthesis") 249 | } 250 | if lhs != 0 && lhs != 1 { 251 | return nil, fmt.Errorf("invalid parenthesis count") 252 | } 253 | 254 | return license, nil 255 | } 256 | 257 | // StringsToLicenses converts a list of input strings and converts them into the 258 | // matching list of license structs. It accepts non-SPDX license names in the 259 | // standard SPDX format of `name(origin)`. 260 | func StringsToLicenses(inputs []string) ([]*License, error) { 261 | licenses := []*License{} 262 | 263 | for _, x := range inputs { 264 | license, err := StringToLicense(x) 265 | if err != nil { 266 | return nil, err 267 | } 268 | licenses = append(licenses, license) 269 | } 270 | 271 | return licenses, nil 272 | } 273 | 274 | // Join joins the string representations of a list of licenses with comma space. 275 | func Join(licenses []*License) string { 276 | xs := []string{} 277 | for _, license := range licenses { 278 | xs = append(xs, license.String()) 279 | } 280 | return strings.Join(xs, ", ") 281 | } 282 | 283 | // InList returns true if a license exists inside a list, otherwise false. It 284 | // uses the license Cmp method to determine equality. 285 | func InList(needle *License, haystack []*License) bool { 286 | for _, x := range haystack { 287 | if needle.Cmp(x) == nil { 288 | return true 289 | } 290 | } 291 | return false 292 | } 293 | 294 | // Union returns the union of licenses in both input lists. 
It uses the pointers 295 | // from the first list in the results. It does not try to remove duplicates so 296 | // if either list has duplicates, you may end up with duplicates in the result. 297 | // It uses the license Cmp method to determine equality. 298 | func Union(haystack1 []*License, haystack2 []*License) []*License { 299 | union := []*License{} 300 | for _, x := range haystack1 { 301 | if InList(x, haystack2) { 302 | union = append(union, x) 303 | } 304 | } 305 | return union 306 | } 307 | -------------------------------------------------------------------------------- /util/licenses/licenses_test.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 
21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | package licenses_test 25 | 26 | import ( 27 | "testing" 28 | 29 | "github.com/awslabs/yesiscan/util/licenses" 30 | ) 31 | 32 | func TestValidate(t *testing.T) { 33 | license := licenses.License{ 34 | SPDX: "AGPL-3.0-or-later", 35 | } 36 | if err := license.Validate(); err != nil { 37 | t.Errorf("err: %+v", err) 38 | return 39 | } 40 | } 41 | 42 | func TestID(t *testing.T) { 43 | if _, err := licenses.ID("AGPL-3.0-or-later"); err != nil { 44 | t.Errorf("err: %+v", err) 45 | return 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /util/util.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | package util 25 | 26 | import ( 27 | "fmt" 28 | "net/url" 29 | "sort" 30 | "strings" 31 | ) 32 | 33 | // StrInList returns true if a string exists inside a list, otherwise false. 
// StrInList returns true if a string exists inside a list, otherwise false.
func StrInList(needle string, haystack []string) bool {
	for _, x := range haystack {
		if needle == x {
			return true
		}
	}
	return false
}

// ShellHyperlinkEncode takes a string, and a uri and returns a shell encoded
// representation of a hyperlink using the modern shell escaping sequence. Idea
// from: https://purpleidea.com/blog/2018/06/29/hyperlinks-in-gnome-terminal/
func ShellHyperlinkEncode(display string, uri string) string {
	x := uri // XXX: how do we escape correctly?
	//x := url.QueryEscape(uri) // XXX: this is the wrong escaping

	return "\033]8;;" + x + "\a" + display + "\033]8;;\a"
}

// HtmlHyperlinkEncode takes a string, and a uri and returns an html
// representation of a hyperlink using the normal anchor tags.
// Fix: the previous version ignored the uri entirely and emitted no anchor
// markup at all, returning just the display string.
// NOTE(review): neither display nor uri is html-escaped here (mirroring the
// unescaped shell variant above); callers must pass trusted values, or this
// should grow html.EscapeString calls -- confirm which is intended.
func HtmlHyperlinkEncode(display string, uri string) string {
	return "<a href=\"" + uri + "\">" + display + "</a>"
}

// SmartURI returns a "smart" URI given an internal UID that we have. The UID
// is the special string that's the unique identifier that's returned from each
// backend. We convert this into a "better" URI if we can. If we can't, we just
// return the uid unchanged.
// TODO: the different helper functions that are called within could be
// provided by each backend, instead of us writing them here and assuming how
// they work.
func SmartURI(uid string) string {
	// is this a github URI?
	if s, err := smartGithubURI(uid); err == nil {
		return s
	}

	return uid
}

// smartGithubURI attempts to return a useful URI from an internal Github UID.
// If we don't detect this as a github UID, then we error.
func smartGithubURI(uid string) (string, error) {
	u, err := url.Parse(uid)
	if err != nil {
		return "", err
	}

	if u.Scheme != "git" && u.Scheme != "https" {
		return "", fmt.Errorf("invalid scheme")
	}
	u.Scheme = "https" // make it user clickable

	if u.Host != "github.com" {
		return "", fmt.Errorf("wrong hostname")
	}

	// the commit hash is carried in a ?sha1=... query parameter
	q := u.Query()
	sha1s := q["sha1"]
	if len(sha1s) != 1 {
		return "", fmt.Errorf("wrong length of sha1s")
	}
	sha1 := sha1s[0]
	if sha1 == "" {
		return "", fmt.Errorf("unknown sha1")
	}
	u.RawQuery = "" // erase it

	// path must be at least user/repo to build a blob URL
	p := strings.TrimPrefix(u.Path, "/")
	ps := strings.Split(p, "/")
	if len(ps) < 2 {
		return "", fmt.Errorf("invalid path")
	}

	u.Path = ps[0] + "/" + ps[1] + "/blob/" + sha1 + "/" + strings.Join(ps[2:], "/")

	u.RawPath = ""       // encoded path hint (see EscapedPath method)
	u.ForceQuery = false // append a query ('?') even if RawQuery is empty

	// TODO: add support for line number ranges, eg: #L13-L42 or just #L42

	u.Fragment = ""    // fragment for references, without '#'
	u.RawFragment = "" // encoded fragment hint (see EscapedFragment method)

	return u.String(), nil
}
// NamedArgsTemplate takes a format string that contains named args wrapped in
// curly brackets, and templates them in. For example, "hello {name}!" will
// turn into "hello world!" if you pass a map with "name" => "world" into it.
func NamedArgsTemplate(format string, replacements map[string]interface{}) string {
	// sort the keys so the replacer is built deterministically
	keys := make([]string, 0, len(replacements))
	for k := range replacements {
		keys = append(keys, k)
	}
	sort.Strings(keys)

	pairs := make([]string, 0, 2*len(keys))
	for _, k := range keys {
		pairs = append(pairs, "{"+k+"}", fmt.Sprint(replacements[k]))
	}

	return strings.NewReplacer(pairs...).Replace(format)
}