├── .github └── workflows │ ├── lint.yml │ ├── race.yml │ ├── release.yml │ └── test.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── FAILURES.md ├── LICENSE.md ├── Makefile ├── README.md ├── cmd └── license-detector │ ├── main.go │ └── main_test.go ├── go.mod ├── go.sum └── licensedb ├── analysis.go ├── api └── api.go ├── dataset.projects.gz ├── dataset.zip ├── dataset_test.go ├── filer ├── filer.go ├── filer_test.go └── test_data │ ├── git │ ├── COMMIT_EDITMSG │ ├── HEAD │ ├── config │ ├── description │ ├── info │ │ └── exclude │ ├── logs │ │ ├── HEAD │ │ └── refs │ │ │ └── heads │ │ │ └── master │ ├── objects │ │ ├── 33 │ │ │ └── 4a82b19a7c893d3807ea52ba35ff2170c296cc │ │ ├── 8a │ │ │ └── 9b00e3e4f0af606178510c19f40c5a77adb881 │ │ ├── cc │ │ │ └── 628ccd10742baea8241c5924df992b5c019f71 │ │ ├── ce │ │ │ └── 013625030ba8dba906f756967f9e9ca394464a │ │ └── f7 │ │ │ └── 922e986704f99de62ca715d4794324a32e9af2 │ └── refs │ │ └── heads │ │ └── master │ ├── local.zip │ └── local │ ├── one │ └── two │ └── three ├── internal ├── assets │ ├── bindata.go │ ├── extract_names.go │ └── extract_urls.go ├── db.go ├── fastlog │ ├── fastlog.go │ └── fastlog_test.go ├── investigation.go ├── nlp.go ├── nlp_test.go ├── normalize │ ├── normalize.go │ └── normalize_test.go ├── processors │ ├── html2text.go │ ├── html2text_test.go │ └── markup.go └── wmh │ ├── test_data │ └── wmh.bin │ ├── wmh.go │ └── wmh_test.go ├── licensedb.go ├── licensedb_test.go └── scan_file_content.go /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | on: [push, pull_request] 2 | name: Lint 3 | jobs: 4 | lint: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/setup-go@v3 8 | with: 9 | go-version: '>=1.18.0' 10 | - uses: actions/checkout@v3 11 | - uses: golangci/golangci-lint-action@v3 12 | with: 13 | version: latest 14 | only-new-issues: true 15 | 
-------------------------------------------------------------------------------- /.github/workflows/race.yml: -------------------------------------------------------------------------------- 1 | on: [push, pull_request] 2 | name: Race 3 | jobs: 4 | race: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - name: Install Go 8 | uses: actions/setup-go@v3 9 | with: 10 | go-version: '1.18' 11 | - name: Checkout code 12 | uses: actions/checkout@v2 13 | - name: Test race 14 | run: go test -v -race -timeout 60m ./... 15 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | on: 3 | release: 4 | types: 5 | - created 6 | 7 | jobs: 8 | release: 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | goos: [linux, darwin] 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Checkout code 16 | uses: actions/checkout@v2 17 | 18 | - name: Release binaries 19 | uses: mcuadros/go-release-action@master 20 | with: 21 | project_path: cmd/license-detector 22 | binary_name: license-detector 23 | github_token: ${{ secrets.GITHUB_TOKEN }} 24 | goversion: '1.18' 25 | goos: ${{ matrix.goos }} 26 | goarch: amd64 27 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | on: [push, pull_request] 2 | name: Test 3 | jobs: 4 | test: 5 | strategy: 6 | fail-fast: false 7 | matrix: 8 | # Oldest supported and latest available. Anything in between should not 9 | # fail in ways that the newest does not also. 
10 | go-version: ['1.18', '1.20'] 11 | platform: [ubuntu-latest, macos-latest] 12 | runs-on: ${{ matrix.platform }} 13 | steps: 14 | - name: Install Go 15 | uses: actions/setup-go@v3 16 | with: 17 | go-version: ${{ matrix.go-version }} 18 | 19 | - name: Checkout code 20 | uses: actions/checkout@v2 21 | 22 | - name: Test 23 | run: go test ./... 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .vscode 3 | # Binaries for programs and plugins 4 | *.exe 5 | *.dll 6 | *.so 7 | *.dylib 8 | 9 | # Test binary, build with `go test -c` 10 | *.test 11 | 12 | # Output of the go coverage tool, specifically when used with LiteIDE 13 | *.out 14 | 15 | # Project-local glide cache, RE: https://github.com/Masterminds/glide/issues/736 16 | .glide/ -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, gender identity and expression, level of experience, 9 | education, socio-economic status, nationality, personal appearance, race, 10 | religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. 
Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at conduct@sourced.tech. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | go-license-detector project is [Apache licensed](LICENSE.md) and accepts 4 | contributions via GitHub pull requests. This document outlines some of the 5 | conventions on development workflow, commit message formatting, contact points, 6 | and other resources to make it easier to get your contribution accepted. 
7 | 8 | 9 | ## Support Channels 10 | 11 | The official support channels, for both users and contributors, are: 12 | 13 | - GitHub [issues](https://github.com/go-enry/go-license-detector/issues)* 14 | 15 | *Before opening a new issue or submitting a new pull request, it's helpful to 16 | search the project - it's likely that another user has already reported the 17 | issue you're facing, or it's a known issue that we're already aware of. 18 | 19 | 20 | ## How to Contribute 21 | 22 | Pull Requests (PRs) are the main and exclusive way to contribute to the official go-license-detector project. 23 | In order for a PR to be accepted it needs to pass a list of requirements: 24 | 25 | - All PRs must be written in idiomatic Go, formatted according to [goimports](https://godoc.org/golang.org/x/tools/cmd/goimports), and free of warnings from [go lint](https://github.com/golang/lint) or [go vet](https://golang.org/cmd/vet/). 26 | - New features should generally be covered by tests. 27 | - The test suite must pass. 28 | - All PRs have to pass the personal evaluation of at least one of the [maintainers](MAINTAINERS.md). 29 | 30 | ### Format of the commit message 31 | 32 | The commit summary must start with a capital letter and a verb in the present tense. No period at the end. 33 | 34 | ``` 35 | Add a feature 36 | Remove unused code 37 | Fix a bug 38 | ``` 39 | 40 | The commit details should describe what was changed, in which context and, if applicable, the GitHub issue the commit relates to. -------------------------------------------------------------------------------- /FAILURES.md: -------------------------------------------------------------------------------- 1 | # License detection failures 2 | 3 | This is a list of known license detection failures. Once common patterns emerge, go-license-detector 4 | will be extended to support them. 
5 | 6 | ### Clear failures 7 | * [Microsoft/vscode](https://github.com/Microsoft/vscode/blob/master/LICENSE.txt) - license file seems legit, clear failure. 8 | * [adobe/brackets](https://github.com/adobe/brackets/blob/master/LICENSE) - license file seems legit, clear failure. 9 | * [pure-css/pure](https://github.com/pure-css/pure/blob/master/LICENSE.md) - BSD license not detected, clear failure. 10 | * [TryGhost/Ghost](https://github.com/TryGhost/Ghost/blob/master/LICENSE) - MIT license, clear failure. 11 | * [nwjs/nw.js](https://github.com/nwjs/nw.js/blob/nw28/LICENSE) - MIT, clear failure. 12 | * [jenkinsci/jenkins](https://github.com/jenkinsci/jenkins/blob/master/LICENSE.txt) - MIT license, clear failure. 13 | * [kahun/awesome-sysadmin](https://github.com/kahun/awesome-sysadmin/blob/master/LICENSE.txt) - CC-BY-SA 4.0, clear failure. 14 | * [AFNetworking/AFNetworking](https://github.com/AFNetworking/AFNetworking/blob/master/LICENSE) - MIT, clear failure. 15 | * [eslint/eslint](https://github.com/eslint/eslint/blob/master/LICENSE) - MIT, clear failure. 16 | * [CocoaLumberjack/CocoaLumberjack](https://github.com/CocoaLumberjack/CocoaLumberjack/blob/master/LICENSE.txt) - BSD, clear failure. 17 | * [linnovate/mean](https://github.com/linnovate/mean/blob/master/LICENSE) - MIT with broken lines, clear failure. 18 | * [bcit-ci/CodeIgniter](https://github.com/bcit-ci/CodeIgniter/blob/develop/license.txt) - MIT, clear failure. 19 | * [moment/moment](https://github.com/moment/moment/blob/develop/LICENSE) - MIT, clear failure. 20 | * [webpack/webpack](https://github.com/webpack/webpack/blob/master/LICENSE) - MIT, clear failure. 21 | * [Alamofire/Alamofire](https://github.com/Alamofire/Alamofire/blob/master/LICENSE) - MIT, clear failure. 22 | 23 | ### Custom file names 24 | * [atech/postal](https://github.com/atech/postal) - the file is `MIT-LICENCE`. 
25 | * [rust-lang/rust](https://github.com/rust-lang/rust) - license files: `LICENSE-APACHE` and `LICENSE-MIT`; also the `README` tells about them. 26 | * [ariya/phantomjs](https://github.com/ariya/phantomjs/blob/master/LICENSE.BSD) - the file is `LICENSE.BSD`. 27 | * [mpv-player/mpv](https://github.com/mpv-player/mpv) - `LICENSE.GPL` and `LICENSE.LGPL`. 28 | * [v8/v8](https://github.com/v8/v8) - `LICENSE.v8` and 4 other custom suffixes; `LICENSE` is a concatenation. 29 | * [philc/vimium](https://github.com/philc/vimium/blob/master/MIT-LICENSE.txt) - `MIT-LICENSE.txt`. 30 | * [torch/torch7](https://github.com/torch/torch7/blob/master/COPYRIGHT.txt) - `COPYRIGHT.txt` with custom header. 31 | * [mathiasbynens/dotfiles](https://github.com/mathiasbynens/dotfiles/blob/master/LICENSE-MIT.txt) - the file is `LICENSE-MIT.txt`. 32 | * [Marak/faker.js](https://github.com/Marak/faker.js/blob/master/MIT-LICENSE.txt) - file is `MIT-LICENSE.txt`, custom header. 33 | * [gionkunz/chartist-js](https://github.com/gionkunz/chartist-js/blob/develop/LICENSE-MIT) - the file is `LICENSE-MIT`. 34 | * [php-fig/fig-standards](https://github.com/php-fig/fig-standards) - the file names are `LICENSE-CC.md` and `LICENSE-MIT.md`. 35 | * [VundleVim/Vundle.vim](https://github.com/VundleVim/Vundle.vim/blob/master/LICENSE-MIT.txt) - the file name is `LICENSE-MIT.txt`. 36 | * [carhartl/jquery-cookie](https://github.com/carhartl/jquery-cookie/blob/master/MIT-LICENSE.txt) - the file name is `MIT-LICENSE.txt`. 37 | * [JetBrains/kotlin](https://github.com/JetBrains/kotlin/tree/master/license) - `license` directory, the standard file there points to Apache. 38 | 39 | ### Pointers 40 | * [akullpp/awesome-java](https://github.com/akullpp/awesome-java/blob/master/LICENSE.md) - file consists of the single reference to CC-BY-SA-4.0. 41 | * [Unitech/pm2](https://github.com/Unitech/pm2/blob/master/LICENSE) - the whole content is the name of the real license file. 
42 | * [ruanyf/es6tutorial](https://github.com/ruanyf/es6tutorial/blob/gh-pages/LICENSE) - human-readable summary of CC-BY-NC 4.0. 43 | * [lukasz-madon/awesome-remote-job](https://github.com/lukasz-madon/awesome-remote-job) - `README` contains a link to CC0. 44 | * [sindresorhus/quick-look-plugins](https://github.com/sindresorhus/quick-look-plugins) - `README` points to CC0. 45 | * [MaximAbramchuck/awesome-interview-questions](https://github.com/MaximAbramchuck/awesome-interview-questions) - `README` states CC0. 46 | * [enaqx/awesome-react](https://github.com/enaqx/awesome-react#license) - `README` states CC0. 47 | * [sorrycc/awesome-javascript](https://github.com/sorrycc/awesome-javascript#license) - `README` points to CC0. 48 | * [vuejs/awesome-vue](https://github.com/vuejs/awesome-vue) - CC0 is appended to the end of `README`. 49 | * [terryum/awesome-deep-learning-papers](https://github.com/terryum/awesome-deep-learning-papers) - `README` points to CC0. 50 | * [gztchan/awesome-design](https://github.com/gztchan/awesome-design) - `README` points to CC0. 51 | * [sindresorhus/awesome-electron](https://github.com/sindresorhus/awesome-electron) - `README` points to CC0. 52 | * [donnemartin/system-design-primer](https://github.com/donnemartin/system-design-primer/blob/master/LICENSE.txt) - custom text, link to CC-BY 4.0. 53 | * [vinta/awesome-python](https://github.com/vinta/awesome-python/blob/master/LICENSE) - points to CC-BY 4.0. 54 | * [EbookFoundation/free-programming-books](https://github.com/EbookFoundation/free-programming-books/blob/master/LICENSE) - points to CC-BY 4.0. 55 | * [kamranahmedse/design-patterns-for-humans](https://github.com/kamranahmedse/design-patterns-for-humans#license) - `README` points to CC-BY 4.0. 56 | * [interagent/http-api-design](https://github.com/interagent/http-api-design/blob/master/LICENSE.md) - license file points to CC-BY 3.0. 
57 | * [kamranahmedse/developer-roadmap](https://github.com/kamranahmedse/developer-roadmap) - `README` points to CC-BY 3.0. 58 | * [saltstack/salt](https://github.com/saltstack/salt/blob/develop/LICENSE) - custom text, link to Apache. 59 | * [inconshreveable/ngrok](https://github.com/inconshreveable/ngrok/blob/master/LICENSE) - points to Apache. 60 | * [google/python-fire](https://github.com/google/python-fire/blob/master/LICENSE) - points to Apache. 61 | * [swagger-api/swagger-ui](https://github.com/swagger-api/swagger-ui/blob/master/LICENSE) - points to Apache. 62 | * [SignalR/SignalR](https://github.com/SignalR/SignalR/blob/dev/LICENSE.txt) - points to Apache. 63 | * [facebookarchive/three20](https://github.com/facebookarchive/three20/blob/master/LICENSE) - points to Apache. 64 | * [atlassian/localstack](https://github.com/atlassian/localstack/blob/master/LICENSE.txt) - points to Apache. 65 | * [dmlc/xgboost](https://github.com/dmlc/xgboost/blob/master/LICENSE) - points to Apache. 66 | * [videojs/video.js](https://github.com/videojs/video.js/blob/master/LICENSE) - points to Apache. 67 | * [alibaba/druid](https://github.com/alibaba/druid/blob/master/license.txt) - points to Apache. 68 | * [astaxie/beego](https://github.com/astaxie/beego/blob/master/LICENSE) - points to Apache. 69 | * [requests/requests](https://github.com/requests/requests/blob/master/LICENSE) - points to Apache. 70 | * [alibaba/fastjson](https://github.com/alibaba/fastjson/blob/master/license.txt) - file points to Apache. 71 | * [Reactive-Extensions/RxJS](https://github.com/Reactive-Extensions/RxJS/blob/master/license.txt) - points to Apache. 72 | * [lord/slate](https://github.com/lord/slate/blob/master/LICENSE) - points to Apache. 73 | * [donnemartin/data-science-ipython-notebooks](https://github.com/donnemartin/data-science-ipython-notebooks/blob/master/LICENSE) - points to Apache. 
74 | * [donnemartin/interactive-coding-challenges](https://github.com/donnemartin/interactive-coding-challenges/blob/master/LICENSE) - points to Apache. 75 | * [SFTtech/openage](https://github.com/SFTtech/openage) - `README` mentions GNU GPLv3. 76 | * [FFmpeg/FFmpeg](https://github.com/FFmpeg/FFmpeg/blob/master/LICENSE.md) - custom license text, points to LGPL and GPL. 77 | * [androidannotations/androidannotations](https://github.com/androidannotations/androidannotations/blob/develop/LICENSE.txt) - license points to Apache and CDDL. 78 | * [Theano/Theano](https://github.com/Theano/Theano/blob/master/LICENSE.txt) - symlink to `doc/LICENSE.txt`. 79 | * [mxgmn/WaveFunctionCollapse](https://github.com/mxgmn/WaveFunctionCollapse/blob/master/LICENSE.md) - custom text, points to MIT. 80 | * [marionettejs/backbone.marionette](https://github.com/marionettejs/backbone.marionette/blob/master/license.txt) - custom text, mentions MIT, links to http://mutedsolutions.mit-license.org/. 81 | * [date-fns/date-fns](https://github.com/date-fns/date-fns/blob/master/LICENSE.md) - custom text, points to http://kossnocorp.mit-license.org/. 82 | * [shadowsocks/shadowsocks-android](https://github.com/shadowsocks/shadowsocks-android/blob/master/LICENSE) - shortened GPL. 83 | * [mozilla/BrowserQuest](https://github.com/mozilla/BrowserQuest/blob/master/LICENSE) - file points to MPL and CC-BY-SA 3.0. 84 | 85 | ### Headers and footers 86 | * [Carthage/Carthage](https://github.com/Carthage/Carthage/blob/master/LICENSE.md) - extra content with copyright at the bottom of MIT license. 87 | * [serverless/serverless](https://github.com/serverless/serverless/blob/master/LICENSE.txt) - MIT with extra notice at the bottom. 88 | * [gitlabhq/gitlabhq](https://github.com/gitlabhq/gitlabhq/blob/master/LICENSE) - extra content at the end of MIT or BSD license. 89 | * [akveo/blur-admin](https://github.com/akveo/blur-admin/blob/master/LICENSE.txt) - MIT license with garbage in the end. 
90 | * [keras-team/keras](https://github.com/keras-team/keras/blob/master/LICENSE) - MIT with many copyright notices at the beginning. 91 | * [meteor/meteor](https://github.com/meteor/meteor/blob/devel/LICENSE) - custom notice appended to the end of the MIT license. 92 | * [celery/celery](https://github.com/celery/celery/blob/master/LICENSE) - BSD with custom header and footer. 93 | * [parse-community/parse-server](https://github.com/parse-community/parse-server/blob/master/LICENSE) - custom footer after BSD. 94 | * [ccgus/fmdb](https://github.com/ccgus/fmdb/blob/master/LICENSE.txt) - custom header before MIT. 95 | * [BVLC/caffe](https://github.com/BVLC/caffe/blob/master/LICENSE) - custom header and footer. 96 | * [JuliaLang/julia](https://github.com/JuliaLang/julia/blob/master/LICENSE.md) - custom header and huge footer describing the dependencies. 97 | * [google/deepdream](https://github.com/google/deepdream/blob/master/LICENSE) - Apache with custom header. 98 | * [pytorch/pytorch](https://github.com/pytorch/pytorch/blob/master/LICENSE) - custom header before BSD. 99 | * [strongloop/loopback](https://github.com/strongloop/loopback/blob/master/LICENSE) - MIT with custom header. 100 | * [aosabook/500lines](https://github.com/aosabook/500lines/blob/master/LICENSE.md) - custom format, BSD at the end. 101 | * [janpaepke/ScrollMagic](https://github.com/janpaepke/ScrollMagic/blob/master/LICENSE.md) - MIT with custom header, mentions GPL. 102 | * [google/protobuf](https://github.com/google/protobuf/blob/master/LICENSE) - BSD with custom header. 103 | * [rapid7/metasploit-framework](https://github.com/rapid7/metasploit-framework/blob/master/COPYING) - BSD with custom footer. 104 | * [ipython/ipython](https://github.com/ipython/ipython/blob/master/COPYING.rst) - custom format, huge footer. 105 | * [jquery/jquery](https://github.com/jquery/jquery/blob/master/LICENSE.txt) - JQuery with custom footer (he-he). 
106 | * [hapijs/hapi](https://github.com/hapijs/hapi/blob/master/LICENSE) - BSD with custom header and especially footer. 107 | * [sqlmapproject/sqlmap](https://github.com/sqlmapproject/sqlmap/blob/master/LICENSE) - GPL with custom header. 108 | 109 | ### Concatenations 110 | * [nodejs/node](https://github.com/nodejs/node/blob/master/LICENSE) - multiple licenses in the same file. 111 | * [chrissimpkins/Hack](https://github.com/source-foundry/Hack/blob/master/LICENSE.md) - multiple licenses in the same file. 112 | * [shadowsocks/shadowsocks-windows](https://github.com/shadowsocks/shadowsocks-windows/blob/master/LICENSE.txt) - multiple licenses concatenated together. 113 | * [Tencent/mars](https://github.com/Tencent/mars/blob/master/LICENSE) - many licenses concatenated, custom header. 114 | * [lodash/lodash](https://github.com/lodash/lodash/blob/master/LICENSE) - several licenses concatenated. 115 | * [libuv/libuv](https://github.com/libuv/libuv/blob/v1.x/LICENSE) - several licenses concatenated together. 116 | * [iview/iview](https://github.com/iview/iview/blob/2.0/LICENSE) - several MIT licenses concatenated together. 117 | * [fatih/vim-go](https://github.com/fatih/vim-go/blob/master/LICENSE) - two BSDs concatenated together, custom header, middle and bottom. 118 | * [bumptech/glide](https://github.com/bumptech/glide/blob/master/LICENSE) - concatenated licenses. 119 | * [sqlitebrowser/sqlitebrowser](https://github.com/sqlitebrowser/sqlitebrowser/blob/master/LICENSE) - GPL and MPL concatenated. 120 | * [Mantle/Mantle](https://github.com/Mantle/Mantle/blob/master/LICENSE.md) - two licenses concatenated. 121 | * [kripken/emscripten](https://github.com/kripken/emscripten/blob/incoming/LICENSE) - several licenses concatenated, custom header and footer. 122 | * [browserify/browserify](https://github.com/browserify/browserify/blob/master/LICENSE) - several licenses concatenated. 
123 | * [aFarkas/html5shiv](https://github.com/aFarkas/html5shiv/blob/master/MIT%20and%20GPL2%20licenses.md) - the file is `MIT and GPL2 licenses.md` and it is a concatenation. 124 | * [Microsoft/CNTK](https://github.com/Microsoft/CNTK/blob/master/LICENSE.md) - custom format, concatenation. 125 | * [stedolan/jq](https://github.com/stedolan/jq/blob/master/COPYING) - concatenation of several licenses. 126 | * [jquery/jquery-ui](https://github.com/jquery/jquery-ui/blob/master/LICENSE.txt) - concatenation of JQuery and CC0. 127 | * [realm/realm-cocoa](https://github.com/realm/realm-cocoa/blob/master/LICENSE) - Apache concatenated with other content. 128 | * [robbiehanson/CocoaAsyncSocket](https://github.com/robbiehanson/CocoaAsyncSocket/blob/master/LICENSE.txt) - public domain and BSD, custom format. 129 | * [jquery/jquery-mobile](https://github.com/jquery/jquery-mobile/blob/master/LICENSE.txt) - concatenation of JQuery and CC0. 130 | 131 | ### Die hards 132 | * [RubaXa/Sortable](https://github.com/RubaXa/Sortable) - license is appended to the end of `README.md`. 133 | * [Automattic/mongoose](https://github.com/Automattic/mongoose) - BSD license is appended to the end of `README`. 134 | * [mperham/sidekiq](https://github.com/mperham/sidekiq/blob/master/LICENSE) - license file is completely custom, mentions LGPL. 135 | * [opencv/opencv](https://github.com/opencv/opencv/blob/master/LICENSE) - license is completely custom format but resembles a BSD. 136 | * [python/cpython](https://github.com/python/cpython/blob/master/LICENSE) - license is PSF 2. 137 | * [tmux/tmux](https://github.com/tmux/tmux) - custom `COPYING`, BSD is mentioned in `README`. 138 | * [facebook/nuclide](https://github.com/facebook/nuclide/blob/master/LICENSE) - custom license? resembles BSD. 139 | * [realm/realm-java](https://github.com/realm/realm-java/blob/master/LICENSE) - custom format, Apache concatenated with various notices. 
140 | * [phanan/htaccess](https://github.com/phanan/htaccess/blob/master/LICENSE) - custom, says something about public domain and the Unlicense. 141 | * [cockroachdb/cockroach](https://github.com/cockroachdb/cockroach/blob/master/LICENSE) - custom text, mentions Apache, CCL, MIT and BSD. 142 | * [fbsamples/f8app](https://github.com/fbsamples/f8app/blob/master/LICENSE) - customized MIT. 143 | * [FortAwesome/Font-Awesome](https://github.com/FortAwesome/Font-Awesome/blob/master/LICENSE.txt) - custom format, mentions several licenses. 144 | * [Microsoft/api-guidelines](https://github.com/Microsoft/api-guidelines/blob/master/Guidelines.md#44-license) - pointer to CC-BY 4.0 in `Guidelines.md`. 145 | * [neovim/neovim](https://github.com/neovim/neovim/blob/master/LICENSE) - Apache with custom header and notes. 146 | * [moklick/frontend-stuff](https://github.com/moklick/frontend-stuff/blob/master/LICENSE) - summarized CC0. 147 | 148 | ### Miscellaneous 149 | * [IanLunn/Hover](https://github.com/IanLunn/Hover/blob/master/license.txt) - custom license, nothing can be done. 150 | * [xdissent/ievms](https://github.com/xdissent/ievms) - "none" license stated in the `README`. 151 | * [isocpp/CppCoreGuidelines](https://github.com/isocpp/CppCoreGuidelines/blob/master/LICENSE) - custom license. 152 | * [Modernizr/Modernizr](https://github.com/Modernizr/Modernizr) - license file is a joke, `README` mentions MIT. 153 | * [froala/design-blocks](https://github.com/froala/design-blocks/blob/dev/LICENSE) - FROALA OPEN WEB DESIGN LICENSE. 154 | * [Swordfish90/cool-retro-term](https://github.com/Swordfish90/cool-retro-term) - `gpl-3.0.txt` and `gpl-2.0.txt` files. 
155 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Apache License 2 | ============== 3 | 4 | _Version 2.0, January 2004_ 5 | _<http://www.apache.org/licenses/>_ 6 | 7 | ### Terms and Conditions for use, reproduction, and distribution 8 | 9 | #### 1. Definitions 10 | 11 | “License” shall mean the terms and conditions for use, reproduction, and 12 | distribution as defined by Sections 1 through 9 of this document. 13 | 14 | “Licensor” shall mean the copyright owner or entity authorized by the copyright 15 | owner that is granting the License. 16 | 17 | “Legal Entity” shall mean the union of the acting entity and all other entities 18 | that control, are controlled by, or are under common control with that entity. 19 | For the purposes of this definition, “control” means **(i)** the power, direct or 20 | indirect, to cause the direction or management of such entity, whether by 21 | contract or otherwise, or **(ii)** ownership of fifty percent (50%) or more of the 22 | outstanding shares, or **(iii)** beneficial ownership of such entity. 23 | 24 | “You” (or “Your”) shall mean an individual or Legal Entity exercising 25 | permissions granted by this License. 26 | 27 | “Source” form shall mean the preferred form for making modifications, including 28 | but not limited to software source code, documentation source, and configuration 29 | files. 30 | 31 | “Object” form shall mean any form resulting from mechanical transformation or 32 | translation of a Source form, including but not limited to compiled object code, 33 | generated documentation, and conversions to other media types. 34 | 35 | “Work” shall mean the work of authorship, whether in Source or Object form, made 36 | available under the License, as indicated by a copyright notice that is included 37 | in or attached to the work (an example is provided in the Appendix below). 
38 | 39 | “Derivative Works” shall mean any work, whether in Source or Object form, that 40 | is based on (or derived from) the Work and for which the editorial revisions, 41 | annotations, elaborations, or other modifications represent, as a whole, an 42 | original work of authorship. For the purposes of this License, Derivative Works 43 | shall not include works that remain separable from, or merely link (or bind by 44 | name) to the interfaces of, the Work and Derivative Works thereof. 45 | 46 | “Contribution” shall mean any work of authorship, including the original version 47 | of the Work and any modifications or additions to that Work or Derivative Works 48 | thereof, that is intentionally submitted to Licensor for inclusion in the Work 49 | by the copyright owner or by an individual or Legal Entity authorized to submit 50 | on behalf of the copyright owner. For the purposes of this definition, 51 | “submitted” means any form of electronic, verbal, or written communication sent 52 | to the Licensor or its representatives, including but not limited to 53 | communication on electronic mailing lists, source code control systems, and 54 | issue tracking systems that are managed by, or on behalf of, the Licensor for 55 | the purpose of discussing and improving the Work, but excluding communication 56 | that is conspicuously marked or otherwise designated in writing by the copyright 57 | owner as “Not a Contribution.” 58 | 59 | “Contributor” shall mean Licensor and any individual or Legal Entity on behalf 60 | of whom a Contribution has been received by Licensor and subsequently 61 | incorporated within the Work. 62 | 63 | #### 2. 
Grant of Copyright License 64 | 65 | Subject to the terms and conditions of this License, each Contributor hereby 66 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 67 | irrevocable copyright license to reproduce, prepare Derivative Works of, 68 | publicly display, publicly perform, sublicense, and distribute the Work and such 69 | Derivative Works in Source or Object form. 70 | 71 | #### 3. Grant of Patent License 72 | 73 | Subject to the terms and conditions of this License, each Contributor hereby 74 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 75 | irrevocable (except as stated in this section) patent license to make, have 76 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where 77 | such license applies only to those patent claims licensable by such Contributor 78 | that are necessarily infringed by their Contribution(s) alone or by combination 79 | of their Contribution(s) with the Work to which such Contribution(s) was 80 | submitted. If You institute patent litigation against any entity (including a 81 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a 82 | Contribution incorporated within the Work constitutes direct or contributory 83 | patent infringement, then any patent licenses granted to You under this License 84 | for that Work shall terminate as of the date such litigation is filed. 85 | 86 | #### 4. 
Redistribution 87 | 88 | You may reproduce and distribute copies of the Work or Derivative Works thereof 89 | in any medium, with or without modifications, and in Source or Object form, 90 | provided that You meet the following conditions: 91 | 92 | * **(a)** You must give any other recipients of the Work or Derivative Works a copy of 93 | this License; and 94 | * **(b)** You must cause any modified files to carry prominent notices stating that You 95 | changed the files; and 96 | * **(c)** You must retain, in the Source form of any Derivative Works that You distribute, 97 | all copyright, patent, trademark, and attribution notices from the Source form 98 | of the Work, excluding those notices that do not pertain to any part of the 99 | Derivative Works; and 100 | * **(d)** If the Work includes a “NOTICE” text file as part of its distribution, then any 101 | Derivative Works that You distribute must include a readable copy of the 102 | attribution notices contained within such NOTICE file, excluding those notices 103 | that do not pertain to any part of the Derivative Works, in at least one of the 104 | following places: within a NOTICE text file distributed as part of the 105 | Derivative Works; within the Source form or documentation, if provided along 106 | with the Derivative Works; or, within a display generated by the Derivative 107 | Works, if and wherever such third-party notices normally appear. The contents of 108 | the NOTICE file are for informational purposes only and do not modify the 109 | License. You may add Your own attribution notices within Derivative Works that 110 | You distribute, alongside or as an addendum to the NOTICE text from the Work, 111 | provided that such additional attribution notices cannot be construed as 112 | modifying the License. 
113 | 114 | You may add Your own copyright statement to Your modifications and may provide 115 | additional or different license terms and conditions for use, reproduction, or 116 | distribution of Your modifications, or for any such Derivative Works as a whole, 117 | provided Your use, reproduction, and distribution of the Work otherwise complies 118 | with the conditions stated in this License. 119 | 120 | #### 5. Submission of Contributions 121 | 122 | Unless You explicitly state otherwise, any Contribution intentionally submitted 123 | for inclusion in the Work by You to the Licensor shall be under the terms and 124 | conditions of this License, without any additional terms or conditions. 125 | Notwithstanding the above, nothing herein shall supersede or modify the terms of 126 | any separate license agreement you may have executed with Licensor regarding 127 | such Contributions. 128 | 129 | #### 6. Trademarks 130 | 131 | This License does not grant permission to use the trade names, trademarks, 132 | service marks, or product names of the Licensor, except as required for 133 | reasonable and customary use in describing the origin of the Work and 134 | reproducing the content of the NOTICE file. 135 | 136 | #### 7. Disclaimer of Warranty 137 | 138 | Unless required by applicable law or agreed to in writing, Licensor provides the 139 | Work (and each Contributor provides its Contributions) on an “AS IS” BASIS, 140 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 141 | including, without limitation, any warranties or conditions of TITLE, 142 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are 143 | solely responsible for determining the appropriateness of using or 144 | redistributing the Work and assume any risks associated with Your exercise of 145 | permissions under this License. 146 | 147 | #### 8. 
Limitation of Liability 148 | 149 | In no event and under no legal theory, whether in tort (including negligence), 150 | contract, or otherwise, unless required by applicable law (such as deliberate 151 | and grossly negligent acts) or agreed to in writing, shall any Contributor be 152 | liable to You for damages, including any direct, indirect, special, incidental, 153 | or consequential damages of any character arising as a result of this License or 154 | out of the use or inability to use the Work (including but not limited to 155 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or 156 | any and all other commercial damages or losses), even if such Contributor has 157 | been advised of the possibility of such damages. 158 | 159 | #### 9. Accepting Warranty or Additional Liability 160 | 161 | While redistributing the Work or Derivative Works thereof, You may choose to 162 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or 163 | other liability obligations and/or rights consistent with this License. However, 164 | in accepting such obligations, You may act only on Your own behalf and on Your 165 | sole responsibility, not on behalf of any other Contributor, and only if You 166 | agree to indemnify, defend, and hold each Contributor harmless for any liability 167 | incurred by, or claims asserted against, such Contributor by reason of your 168 | accepting any such warranty or additional liability. 169 | 170 | _END OF TERMS AND CONDITIONS_ 171 | 172 | ### APPENDIX: How to apply the Apache License to your work 173 | 174 | To apply the Apache License to your work, attach the following boilerplate 175 | notice, with the fields enclosed by brackets `[]` replaced with your own 176 | identifying information. (Don't include the brackets!) The text should be 177 | enclosed in the appropriate comment syntax for the file format. 
We also 178 | recommend that a file or class name and description of purpose be included on 179 | the same “printed page” as the copyright notice for easier identification within 180 | third-party archives. 181 | 182 | Copyright [yyyy] [name of copyright owner] 183 | 184 | Licensed under the Apache License, Version 2.0 (the "License"); 185 | you may not use this file except in compliance with the License. 186 | You may obtain a copy of the License at 187 | 188 | http://www.apache.org/licenses/LICENSE-2.0 189 | 190 | Unless required by applicable law or agreed to in writing, software 191 | distributed under the License is distributed on an "AS IS" BASIS, 192 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 193 | See the License for the specific language governing permissions and 194 | limitations under the License. 195 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | GOPATH ?= $(shell go env GOPATH) 2 | SPDX_DATA_VERSION ?= 3.17 3 | 4 | licensedb/internal/assets/bindata.go: licenses.tar urls.csv names.csv $(GOPATH)/bin/go-bindata 5 | rm -rf license-list-data-$(SPDX_DATA_VERSION) 6 | rm -f license-list-data.tar.gz 7 | $(GOPATH)/bin/go-bindata -nometadata -pkg assets -o licensedb/internal/assets/bindata.go licenses.tar urls.csv names.csv 8 | rm licenses.tar urls.csv names.csv 9 | 10 | licenses.tar: license-list-data.tar.gz 11 | tar -xf license-list-data.tar.gz license-list-data-$(SPDX_DATA_VERSION)/text 12 | tar -cf licenses.tar -C license-list-data-$(SPDX_DATA_VERSION)/text . 
13 | 14 | license-list-data-$(SPDX_DATA_VERSION)/json/details: license-list-data.tar.gz 15 | tar -xf license-list-data.tar.gz license-list-data-$(SPDX_DATA_VERSION)/json/details 16 | 17 | urls.csv: license-list-data-$(SPDX_DATA_VERSION)/json/details 18 | go run licensedb/internal/assets/extract_urls.go license-list-data-$(SPDX_DATA_VERSION)/json/details > urls.csv 19 | 20 | names.csv: license-list-data-$(SPDX_DATA_VERSION)/json/details 21 | go run licensedb/internal/assets/extract_names.go license-list-data-$(SPDX_DATA_VERSION)/json/details > names.csv 22 | 23 | license-list-data.tar.gz: 24 | curl -SLk -o license-list-data.tar.gz https://github.com/spdx/license-list-data/archive/v$(SPDX_DATA_VERSION).tar.gz 25 | 26 | $(GOPATH)/bin/go-bindata: 27 | go get -v github.com/jteeuwen/go-bindata/go-bindata@6025e8de665b31fa74ab1a66f2cddd8c0abf887e 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # go-license-detector [![GoDoc](https://godoc.org/github.com/go-enry/go-license-detector/v4?status.svg)](https://pkg.go.dev/github.com/go-enry/go-license-detector/v4@v4.0.0/licensedb) [![Test](https://github.com/go-enry/go-license-detector/workflows/Test/badge.svg)](https://github.com/go-enry/go-license-detector/actions) [![Go Report Card](https://goreportcard.com/badge/github.com/go-enry/go-license-detector)](https://goreportcard.com/badge/github.com/go-enry/go-license-detector) 2 | 3 | Project license detector - a command line application and a library, written in Go. 4 | It scans the given directory for license files, normalizes and hashes them and outputs 5 | all the fuzzy matches with the list of reference texts. 6 | The returned names follow [SPDX](https://spdx.org/licenses/) standard. 7 | Read the [blog post](https://blog.sourced.tech/post/gld/). 8 | 9 | Why? 
There are no similar projects which can be compiled into a native binary without 10 | dependencies and also support the whole SPDX license database (≈400 items). 11 | This implementation is also fast, requires little memory, and the API is easy to use. 12 | 13 | The license texts are taken directly from [license-list-data](https://github.com/spdx/license-list-data) 14 | repository. The detection algorithm is **not template matching**; 15 | this directly implies that go-license-detector does not provide any legal guarantees. 16 | The intended area of its usage is data mining. 17 | 18 | ## Installation 19 | 20 | ``` 21 | go get github.com/go-enry/go-license-detector/v4/licensedb 22 | ``` 23 | 24 | The CLI is available for download at the [release](https://github.com/go-enry/go-license-detector/releases/latest) page. 25 | 26 | ## Algorithm 27 | 28 | 1. Find files in the root directory which may represent a license. E.g. `LICENSE` or `license.md`. 29 | 2. If the file is Markdown or reStructuredText, render to HTML and then convert to plain text. Original HTML files are also converted. 30 | 3. Normalize the text according to [SPDX recommendations](https://spdx.org/spdx-license-list/matching-guidelines). 31 | 4. Split the text into unigrams and build the weighted bag of words. 32 | 5. Calculate [Weighted MinHash](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/36928.pdf). 33 | 6. Apply Locality Sensitive Hashing and pick the reference licenses which are close. 34 | 7. For each of the candidates, calculate the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) - `D`. 35 | The corresponding text is the single line with each unigram represented by a single rune (character). 36 | 8. Set the similarity as `1 - D / L` where `L` is the number of unigrams in the queried license. 37 | 38 | This pipeline guarantees constant time queries, though requires some initialization to preprocess 39 | the reference licenses. 
40 | 41 | If no license files are found: 42 | 43 | 1. Look for README files. 44 | 2. If the file is Markdown or reStructuredText, render to HTML and then convert to plain text. Original HTML files are also converted. 45 | 3. Scan for words like "copyright", "license" and "released under". Take the neighborhood. 46 | 4. Run Named Entity Recognition (NER) over that surrounding context and extract the possible license name. 47 | 5. Match it against the list of license names from SPDX. 48 | 49 | ## Usage 50 | 51 | Command line: 52 | 53 | ```bash 54 | license-detector /path/to/project 55 | license-detector https://github.com/go-git/go-git 56 | ``` 57 | 58 | Library (for a single license detection): 59 | 60 | ```go 61 | import ( 62 |    "github.com/go-enry/go-license-detector/v4/licensedb" 63 | "github.com/go-enry/go-license-detector/v4/licensedb/filer" 64 | ) 65 | 66 | func main() { 67 | licenses, err := licensedb.Detect(filer.FromDirectory("/path/to/project")) 68 | } 69 | ``` 70 | 71 | Library (for a convenient data structure that can be formatted as JSON): 72 | 73 | ```go 74 | import ( 75 | "encoding/json" 76 | "fmt" 77 | 78 | "github.com/go-enry/go-license-detector/v4/licensedb" 79 | ) 80 | 81 | func main() { 82 | results := licensedb.Analyse("/path/to/project1", "/path/to/project2") 83 | bytes, err := json.MarshalIndent(results, "", "\t") 84 | if err != nil { 85 | fmt.Printf("could not encode result to JSON: %v\n", err) 86 | } 87 | fmt.Println(string(bytes)) 88 | } 89 | ``` 90 | 91 | 92 | ## Quality 93 | 94 | On the [dataset](licensedb/dataset.zip) of ~1000 most starred repositories on GitHub as of early February 2018 95 | ([list](licensedb/dataset.projects.gz)), **99%** of the licenses are detected. 96 | The analysis of detection failures is tracked in [FAILURES.md](FAILURES.md). 
97 | 98 | Comparison to other projects on that dataset: 99 | 100 | |Detector|Detection rate|Time to scan, sec| 101 | |:-------|:----------------------------------------:|:-----------------------------------------| 102 | |[go-license-detector](https://github.com/go-enry/go-license-detector)| 99% (897/902) | 13.5 | 103 | |[benbalter/licensee](https://github.com/benbalter/licensee)| 75% (673/902) | 111 | 104 | |[google/licenseclassifier](https://github.com/google/licenseclassifier)| 76% (682/902) | 907 | 105 | |[boyter/lc](https://github.com/boyter/lc)| 88% (797/902) | 548 | 106 | |[amzn/askalono](https://github.com/amzn/askalono)| 87% (785/902) | 165 | 107 | |[LiD](https://source.codeaurora.org/external/qostg/lid)| 94% (847/902) | 3660 | 108 | 109 |
How this was measured 110 |
$ cd $(go env GOPATH)/src/github.com/go-enry/go-license-detector/v4/licensedb
111 | $ mkdir dataset && cd dataset
112 | $ unzip ../dataset.zip
113 | $ # go-enry/go-license-detector
114 | $ time license-detector * \
115 |   | grep -Pzo '\n[-0-9a-zA-Z]+\n\tno license' | grep -Pa '\tno ' | wc -l
116 | $ # benbalter/licensee
117 | $ time ls -1 | xargs -n1 -P4 licensee \
118 |   | grep -E "^License: Other" | wc -l
119 | $ # google/licenseclassifier
120 | $ time find -type f -print | xargs -n1 -P4 identify_license \
121 |   | cut -d/ -f2 | sort | uniq | wc -l
122 | $ # boyter/lc
123 | $ time lc . \
124 |   | grep -vE 'NOASSERTION|----|Directory' | cut -d" " -f1 | sort | uniq | wc -l
125 | $ # amzn/askalono
126 | $ echo '#!/bin/sh
127 | result=$(askalono id "$1")
128 | echo "$1
129 | $result"' > ../askalono.wrapper
130 | $ time find -type f -print | xargs -n1 -P4 sh ../askalono.wrapper | grep -Pzo '.*\nLicense: .*\n' askalono.txt | grep -av "License: " | cut -d/ -f 2 | sort | uniq | wc -l
131 | $ # LiD
132 | $ time license-identifier -I dataset -F csv -O lid
133 | $ cat lid_*.csv | cut -d, -f1 | cut -d"'" -f 2 | grep / | cut -d/ -f2 | sort | uniq | wc -l
134 | 
135 |
136 | 137 | ## Regenerate binary data 138 | 139 | The SPDX licenses are included in the binary. To update them, run 140 | ``` 141 | # go install github.com/go-bindata/go-bindata/... 142 | make licensedb/internal/assets/bindata.go 143 | ``` 144 | 145 | ## Contributions 146 | 147 | ...are welcome, see [CONTRIBUTING.md](CONTRIBUTING.md) and [code of conduct](CODE_OF_CONDUCT.md). 148 | 149 | ## License 150 | 151 | Apache 2.0, see [LICENSE.md](LICENSE.md). 152 | -------------------------------------------------------------------------------- /cmd/license-detector/main.go: -------------------------------------------------------------------------------- 1 | // license-detector prints the most probable licenses for a repository 2 | // given either its path in the local file system or a URL pointing to 3 | // the repository. 4 | package main 5 | 6 | import ( 7 | "encoding/json" 8 | "fmt" 9 | "io" 10 | "log" 11 | "os" 12 | 13 | "github.com/go-enry/go-license-detector/v4/licensedb" 14 | "github.com/spf13/pflag" 15 | ) 16 | 17 | func main() { 18 | format := pflag.StringP("format", "f", "text", "Output format: json, text") 19 | pflag.Usage = func() { 20 | fmt.Fprintln(os.Stderr, "Usage: license-detector path ...") 21 | pflag.PrintDefaults() 22 | } 23 | pflag.Parse() 24 | if (*format != "json" && *format != "text") || pflag.NArg() == 0 { 25 | pflag.Usage() 26 | os.Exit(1) 27 | } 28 | detect(pflag.Args(), *format, os.Stdout) 29 | } 30 | 31 | // detect runs license analysis on each item in `args` and outputs 32 | // the results in the specified `format` to `writer`. 33 | func detect(args []string, format string, writer io.Writer) { 34 | results := licensedb.Analyse(args...) 
35 | 36 | switch format { 37 | case "text": 38 | for _, res := range results { 39 | fmt.Fprintln(writer, res.Arg) 40 | if res.ErrStr != "" { 41 | fmt.Fprintf(writer, "\t%v\n", res.ErrStr) 42 | continue 43 | } 44 | for _, m := range res.Matches { 45 | fmt.Fprintf(writer, "\t%1.f%%\t%s\n", 100*m.Confidence, m.License) 46 | } 47 | } 48 | case "json": 49 | b, err := json.MarshalIndent(results, "", "\t") 50 | if err != nil { 51 | log.Fatalf("could not encode result to JSON: %v", err) 52 | } 53 | fmt.Fprintf(writer, "%s\n", b) 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /cmd/license-detector/main_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "encoding/json" 6 | "testing" 7 | 8 | "github.com/go-enry/go-license-detector/v4/licensedb" 9 | "github.com/stretchr/testify/assert" 10 | ) 11 | 12 | func TestCmdMain(t *testing.T) { 13 | buffer := &bytes.Buffer{} 14 | detect([]string{"../..", "."}, "json", buffer) 15 | var r []licensedb.Result 16 | err := json.Unmarshal(buffer.Bytes(), &r) 17 | assert.NoError(t, err) 18 | assert.Len(t, r, 2) 19 | assert.Equal(t, "../..", r[0].Arg) 20 | assert.Equal(t, ".", r[1].Arg) 21 | assert.Len(t, r[0].Matches, 4) 22 | assert.Len(t, r[1].Matches, 0) 23 | assert.Equal(t, "", r[0].ErrStr) 24 | assert.Equal(t, "no license file was found", r[1].ErrStr) 25 | assert.Equal(t, "Apache-2.0", r[0].Matches[0].License) 26 | assert.InDelta(t, 0.9877, r[0].Matches[0].Confidence, 0.002) 27 | assert.Equal(t, "ECL-2.0", r[0].Matches[1].License) 28 | assert.InDelta(t, 0.9047, r[0].Matches[1].Confidence, 0.002) 29 | buffer.Reset() 30 | detect([]string{"../..", "."}, "text", buffer) 31 | assert.Equal(t, `../.. 32 | 99% Apache-2.0 33 | 90% ECL-2.0 34 | 81% SHL-0.51 35 | 81% SHL-0.5 36 | . 
37 | no license file was found 38 | `, buffer.String()) 39 | } 40 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/go-enry/go-license-detector/v4 2 | 3 | go 1.18 4 | 5 | require ( 6 | github.com/ekzhu/minhash-lsh v0.0.0-20190924033628-faac2c6342f8 7 | github.com/go-git/go-git/v5 v5.4.2 8 | github.com/hhatto/gorst v0.0.0-20181029133204-ca9f730cac5b 9 | github.com/jdkato/prose v1.2.1 10 | github.com/pkg/errors v0.9.1 11 | github.com/russross/blackfriday/v2 v2.1.0 12 | github.com/sergi/go-diff v1.2.0 13 | github.com/spf13/pflag v1.0.5 14 | github.com/stretchr/testify v1.8.0 15 | golang.org/x/exp v0.0.0-20221006183845-316c7553db56 16 | golang.org/x/net v0.0.0-20221004154528-8021a29435af 17 | golang.org/x/text v0.3.7 18 | gonum.org/v1/gonum v0.8.2 19 | ) 20 | 21 | require ( 22 | github.com/Microsoft/go-winio v0.6.0 // indirect 23 | github.com/ProtonMail/go-crypto v0.0.0-20220930113650-c6815a8c17ad // indirect 24 | github.com/acomagu/bufpipe v1.0.3 // indirect 25 | github.com/cloudflare/circl v1.2.0 // indirect 26 | github.com/davecgh/go-spew v1.1.1 // indirect 27 | github.com/dgryski/go-metro v0.0.0-20180109044635-280f6062b5bc // indirect 28 | github.com/dgryski/go-minhash v0.0.0-20190315135803-ad340ca03076 // indirect 29 | github.com/dgryski/go-spooky v0.0.0-20170606183049-ed3d087f40e2 // indirect 30 | github.com/emirpasic/gods v1.18.1 // indirect 31 | github.com/go-git/gcfg v1.5.0 // indirect 32 | github.com/go-git/go-billy/v5 v5.3.1 // indirect 33 | github.com/imdario/mergo v0.3.13 // indirect 34 | github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 // indirect 35 | github.com/kevinburke/ssh_config v1.2.0 // indirect 36 | github.com/mitchellh/go-homedir v1.1.0 // indirect 37 | github.com/montanaflynn/stats v0.6.6 // indirect 38 | github.com/pmezard/go-difflib v1.0.0 // indirect 39 | 
github.com/shogo82148/go-shuffle v1.0.1 // indirect 40 | github.com/xanzy/ssh-agent v0.3.2 // indirect 41 | golang.org/x/crypto v0.0.0-20221005025214-4161e89ecf1b // indirect 42 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4 // indirect 43 | golang.org/x/sys v0.0.0-20220928140112-f11e5e49a4ec // indirect 44 | golang.org/x/tools v0.1.12 // indirect 45 | gopkg.in/neurosnap/sentences.v1 v1.0.7 // indirect 46 | gopkg.in/warnings.v0 v0.1.2 // indirect 47 | gopkg.in/yaml.v3 v3.0.1 // indirect 48 | ) 49 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/Microsoft/go-winio v0.4.14/go.mod h1:qXqCSQ3Xa7+6tgxaGTIe4Kpcdsi+P8jBhyzoq1bpyYA= 2 | github.com/Microsoft/go-winio v0.4.16/go.mod h1:XB6nPKklQyQ7GC9LdcBEcBl8PF76WugXOPRXwdLnMv0= 3 | github.com/Microsoft/go-winio v0.5.2/go.mod h1:WpS1mjBmmwHBEWmogvA2mj8546UReBk4v8QkMxJ6pZY= 4 | github.com/Microsoft/go-winio v0.6.0 h1:slsWYD/zyx7lCXoZVlvQrj0hPTM1HI4+v1sIda2yDvg= 5 | github.com/Microsoft/go-winio v0.6.0/go.mod h1:cTAf44im0RAYeL23bpB+fzCyDH2MJiz2BO69KH/soAE= 6 | github.com/ProtonMail/go-crypto v0.0.0-20210428141323-04723f9f07d7/go.mod h1:z4/9nQmJSSwwds7ejkxaJwO37dru3geImFUdJlaLzQo= 7 | github.com/ProtonMail/go-crypto v0.0.0-20220930113650-c6815a8c17ad h1:QeeqI2zxxgZVe11UrYFXXx6gVxPVF40ygekjBzEg4XY= 8 | github.com/ProtonMail/go-crypto v0.0.0-20220930113650-c6815a8c17ad/go.mod h1:UBYPn8k0D56RtnR8RFQMjmh4KrZzWJ5o7Z9SYjossQ8= 9 | github.com/acomagu/bufpipe v1.0.3 h1:fxAGrHZTgQ9w5QqVItgzwj235/uYZYgbXitB+dLupOk= 10 | github.com/acomagu/bufpipe v1.0.3/go.mod h1:mxdxdup/WdsKVreO5GpW4+M/1CE2sMG4jeGJ2sYmHc4= 11 | github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw= 12 | github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239 h1:kFOfPq6dUM1hTo4JG6LR5AXSUEsOjtdm0kw0FtQtMJA= 13 | github.com/anmitsu/go-shlex 
v0.0.0-20161002113705-648efa622239/go.mod h1:2FmKhYUyUczH0OGQWaF5ceTx0UBShxjsH6f8oGKYe2c= 14 | github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= 15 | github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= 16 | github.com/bwesterb/go-ristretto v1.2.0/go.mod h1:fUIoIZaG73pV5biE2Blr2xEzDoMj7NFEuV9ekS419A0= 17 | github.com/bwesterb/go-ristretto v1.2.1/go.mod h1:fUIoIZaG73pV5biE2Blr2xEzDoMj7NFEuV9ekS419A0= 18 | github.com/cloudflare/circl v1.1.0/go.mod h1:prBCrKB9DV4poKZY1l9zBXg2QJY7mvgRvtMxxK7fi4I= 19 | github.com/cloudflare/circl v1.2.0 h1:NheeISPSUcYftKlfrLuOo4T62FkmD4t4jviLfFFYaec= 20 | github.com/cloudflare/circl v1.2.0/go.mod h1:Ch2UgYr6ti2KTtlejELlROl0YIYj7SLjAC8M+INXlMk= 21 | github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= 22 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 23 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 24 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 25 | github.com/dgryski/go-metro v0.0.0-20180109044635-280f6062b5bc h1:8WFBn63wegobsYAX0YjD+8suexZDga5CctH4CCTx2+8= 26 | github.com/dgryski/go-metro v0.0.0-20180109044635-280f6062b5bc/go.mod h1:c9O8+fpSOX1DM8cPNSkX/qsBWdkD4yd2dpciOWQjpBw= 27 | github.com/dgryski/go-minhash v0.0.0-20190315135803-ad340ca03076 h1:EB7M2v8Svo3kvIDy+P1YDE22XskDQP+TEYGzeDwPAN4= 28 | github.com/dgryski/go-minhash v0.0.0-20190315135803-ad340ca03076/go.mod h1:VBi0XHpFy0xiMySf6YpVbRqrupW4RprJ5QTyN+XvGSM= 29 | github.com/dgryski/go-spooky v0.0.0-20170606183049-ed3d087f40e2 h1:lx1ZQgST/imDhmLpYDma1O3Cx9L+4Ie4E8S2RjFPQ30= 30 | github.com/dgryski/go-spooky v0.0.0-20170606183049-ed3d087f40e2/go.mod h1:hgHYKsoIw7S/hlWtP7wD1wZ7SX1jPTtKko5X9jrOgPQ= 31 | github.com/ekzhu/minhash-lsh v0.0.0-20190924033628-faac2c6342f8 
h1:+Tje+xk1lmGKSJjYNtgCFsU1HtQzz0kCm1DFbKlvFBo= 32 | github.com/ekzhu/minhash-lsh v0.0.0-20190924033628-faac2c6342f8/go.mod h1:yEtCVi+QamvzjEH4U/m6ZGkALIkF2xfQnFp0BcKmIOk= 33 | github.com/emirpasic/gods v1.12.0/go.mod h1:YfzfFFoVP/catgzJb4IKIqXjX78Ha8FMSDh3ymbK86o= 34 | github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc= 35 | github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ= 36 | github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568/go.mod h1:xEzjJPgXI435gkrCt3MPfRiAkVrwSbHsst4LCFVfpJc= 37 | github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= 38 | github.com/gliderlabs/ssh v0.2.2 h1:6zsha5zo/TWhRhwqCD3+EarCAgZ2yN28ipRnGPnwkI0= 39 | github.com/gliderlabs/ssh v0.2.2/go.mod h1:U7qILu1NlMHj9FlMhZLlkCdDnU1DBEAqr0aevW3Awn0= 40 | github.com/go-git/gcfg v1.5.0 h1:Q5ViNfGF8zFgyJWPqYwA7qGFoMTEiBmdlkcfRmpIMa4= 41 | github.com/go-git/gcfg v1.5.0/go.mod h1:5m20vg6GwYabIxaOonVkTdrILxQMpEShl1xiMF4ua+E= 42 | github.com/go-git/go-billy/v5 v5.2.0/go.mod h1:pmpqyWchKfYfrkb/UVH4otLvyi/5gJlGI4Hb3ZqZ3W0= 43 | github.com/go-git/go-billy/v5 v5.3.1 h1:CPiOUAzKtMRvolEKw+bG1PLRpT7D3LIs3/3ey4Aiu34= 44 | github.com/go-git/go-billy/v5 v5.3.1/go.mod h1:pmpqyWchKfYfrkb/UVH4otLvyi/5gJlGI4Hb3ZqZ3W0= 45 | github.com/go-git/go-git-fixtures/v4 v4.2.1 h1:n9gGL1Ct/yIw+nfsfr8s4+sbhT+Ncu2SubfXjIWgci8= 46 | github.com/go-git/go-git-fixtures/v4 v4.2.1/go.mod h1:K8zd3kDUAykwTdDCr+I0per6Y6vMiRR/nnVTBtavnB0= 47 | github.com/go-git/go-git/v5 v5.4.2 h1:BXyZu9t0VkbiHtqrsvdq39UDhGJTl1h55VW6CSC4aY4= 48 | github.com/go-git/go-git/v5 v5.4.2/go.mod h1:gQ1kArt6d+n+BGd+/B/I74HwRTLhth2+zti4ihgckDc= 49 | github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= 50 | github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= 51 | github.com/google/go-cmp v0.5.8 
h1:e6P7q2lk1O+qJJb4BtCQXlK8vWEO8V1ZeuEdJNOqZyg= 52 | github.com/hhatto/gorst v0.0.0-20181029133204-ca9f730cac5b h1:Jdu2tbAxkRouSILp2EbposIb8h4gO+2QuZEn3d9sKAc= 53 | github.com/hhatto/gorst v0.0.0-20181029133204-ca9f730cac5b/go.mod h1:HmaZGXHdSwQh1jnUlBGN2BeEYOHACLVGzYOXCbsLvxY= 54 | github.com/imdario/mergo v0.3.12/go.mod h1:jmQim1M+e3UYxmgPu/WyfjB3N3VflVyUjjjwH0dnCYA= 55 | github.com/imdario/mergo v0.3.13 h1:lFzP57bqS/wsqKssCGmtLAb8A0wKjLGrve2q3PPVcBk= 56 | github.com/imdario/mergo v0.3.13/go.mod h1:4lJ1jqUDcsbIECGy0RUJAXNIhg+6ocWgb1ALK2O4oXg= 57 | github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 h1:BQSFePA1RWJOlocH6Fxy8MmwDt+yVQYULKfN0RoTN8A= 58 | github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99/go.mod h1:1lJo3i6rXxKeerYnT8Nvf0QmHCRC1n8sfWVwXF2Frvo= 59 | github.com/jdkato/prose v1.2.1 h1:Fp3UnJmLVISmlc57BgKUzdjr0lOtjqTZicL3PaYy6cU= 60 | github.com/jdkato/prose v1.2.1/go.mod h1:AiRHgVagnEx2JbQRQowVBKjG0bcs/vtkGCH1dYAL1rA= 61 | github.com/jessevdk/go-flags v1.5.0/go.mod h1:Fw0T6WPc1dYxT4mKEZRfG5kJhaTDP9pj1c2EWnYs/m4= 62 | github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= 63 | github.com/kevinburke/ssh_config v0.0.0-20201106050909-4977a11b4351/go.mod h1:CT57kijsi8u/K/BOFA39wgDQJ9CxiF4nAY/ojJ6r6mM= 64 | github.com/kevinburke/ssh_config v1.2.0 h1:x584FjTGwHzMwvHx18PXxbBVzfnxogHaAReU4gf13a4= 65 | github.com/kevinburke/ssh_config v1.2.0/go.mod h1:CT57kijsi8u/K/BOFA39wgDQJ9CxiF4nAY/ojJ6r6mM= 66 | github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= 67 | github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= 68 | github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI= 69 | github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= 70 | github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= 71 | github.com/kr/text 
v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= 72 | github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= 73 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= 74 | github.com/matryer/is v1.2.0 h1:92UTHpy8CDwaJ08GqLDzhhuixiBUUD1p3AU6PHddz4A= 75 | github.com/matryer/is v1.2.0/go.mod h1:2fLPjFQM9rhQ15aVEtbuwhJinnOqrmgXPNdZsdwlWXA= 76 | github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= 77 | github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= 78 | github.com/montanaflynn/stats v0.6.3/go.mod h1:wL8QJuTMNUDYhXwkmfOly8iTdp5TEcJFWZD2D7SIkUc= 79 | github.com/montanaflynn/stats v0.6.6 h1:Duep6KMIDpY4Yo11iFsvyqJDyfzLF9+sndUKT+v64GQ= 80 | github.com/montanaflynn/stats v0.6.6/go.mod h1:etXPPgVO6n31NxCd9KQUMvCM+ve0ruNzt6R8Bnaayow= 81 | github.com/neurosnap/sentences v1.0.6 h1:iBVUivNtlwGkYsJblWV8GGVFmXzZzak907Ci8aA0VTE= 82 | github.com/neurosnap/sentences v1.0.6/go.mod h1:pg1IapvYpWCJJm/Etxeh0+gtMf1rI1STY9S7eUCPbDc= 83 | github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= 84 | github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 85 | github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= 86 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 87 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 88 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 89 | github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= 90 | github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 91 | github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= 92 | github.com/sergi/go-diff v1.2.0 
h1:XU+rvMAioB0UC3q1MFrIQy4Vo5/4VsRDQQXHsEya6xQ= 93 | github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= 94 | github.com/shogo82148/go-shuffle v0.0.0-20180218125048-27e6095f230d/go.mod h1:2htx6lmL0NGLHlO8ZCf+lQBGBHIbEujyywxJArf+2Yc= 95 | github.com/shogo82148/go-shuffle v1.0.1 h1:4swIpHXLMAz14DE4YTgakgadpRN0n1wE1dieGnOTVFU= 96 | github.com/shogo82148/go-shuffle v1.0.1/go.mod h1:HQPjVgUUZ9TNgm4/K/iXRuAdhPsQrXnAGgtk/9kqbBY= 97 | github.com/sirupsen/logrus v1.4.1/go.mod h1:ni0Sbl8bgC9z8RoU9G6nDWqqs/fq4eDPysMBDgk/93Q= 98 | github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= 99 | github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= 100 | github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= 101 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 102 | github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 103 | github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= 104 | github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= 105 | github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= 106 | github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 107 | github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 108 | github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 109 | github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= 110 | github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= 111 | github.com/xanzy/ssh-agent v0.3.0/go.mod h1:3s9xbODqPuuhK9JV1R321M/FlMZSBvE5aY6eAcqrDh0= 112 | github.com/xanzy/ssh-agent v0.3.2 h1:eKj4SX2Fe7mui28ZgnFW5fmTz1EIr7ugo5s6wDxdHBM= 113 | github.com/xanzy/ssh-agent v0.3.2/go.mod 
h1:6dzNDKs0J9rVPHPhaGCukekBHKqfl+L3KghI1Bc68Uw= 114 | golang.org/x/crypto v0.0.0-20190219172222-a4c6cb3142f2/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= 115 | golang.org/x/crypto v0.0.0-20210322153248-0c34fe9e7dc2/go.mod h1:T9bdIzuCu7OtxOm1hfPfRQxPLYneinmdGuTeoZ9dtd4= 116 | golang.org/x/crypto v0.0.0-20210421170649-83a5a9bb288b/go.mod h1:T9bdIzuCu7OtxOm1hfPfRQxPLYneinmdGuTeoZ9dtd4= 117 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= 118 | golang.org/x/crypto v0.0.0-20220315160706-3147a52a75dd/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= 119 | golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= 120 | golang.org/x/crypto v0.0.0-20221005025214-4161e89ecf1b h1:huxqepDufQpLLIRXiVkTvnxrzJlpwmIWAObmcCcUFr0= 121 | golang.org/x/crypto v0.0.0-20221005025214-4161e89ecf1b/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= 122 | golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= 123 | golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= 124 | golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= 125 | golang.org/x/exp v0.0.0-20221006183845-316c7553db56 h1:BrYbdKcCNjLyrN6aKqXy4hPw9qGI8IATkj4EWv9Q+kQ= 126 | golang.org/x/exp v0.0.0-20221006183845-316c7553db56/go.mod h1:cyybsKvd6eL0RnXn6p/Grxp8F5bW7iYuBgsNCOHpMYE= 127 | golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= 128 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4 h1:6zppjxzCulZykYSLyVDYbneBfbaBIQPYMevg0bEwv2s= 129 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= 130 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= 131 | 
golang.org/x/net v0.0.0-20210326060303-6b1517762897/go.mod h1:uSPa2vr4CLtc/ILN5odXGNXS6mhrKVzTaCXzk9m6W3k= 132 | golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= 133 | golang.org/x/net v0.0.0-20221004154528-8021a29435af h1:wv66FM3rLZGPdxpYL+ApnDe2HzHcTFta3z5nsc13wI4= 134 | golang.org/x/net v0.0.0-20221004154528-8021a29435af/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk= 135 | golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 136 | golang.org/x/sys v0.0.0-20190507160741-ecd444e8653b/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 137 | golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 138 | golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 139 | golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 140 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 141 | golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 142 | golang.org/x/sys v0.0.0-20210320140829-1e4c9ba3b0c4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 143 | golang.org/x/sys v0.0.0-20210324051608-47abb6519492/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 144 | golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 145 | golang.org/x/sys v0.0.0-20210502180810-71e4cd670f79/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 146 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 147 | golang.org/x/sys v0.0.0-20211007075335-d3039528d8ac/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 148 | golang.org/x/sys v0.0.0-20220315194320-039c03cc5b86/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 149 | golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 150 | golang.org/x/sys v0.0.0-20220928140112-f11e5e49a4ec h1:BkDtF2Ih9xZ7le9ndzTA7KJow28VbQW3odyk/8drmuI= 151 | golang.org/x/sys v0.0.0-20220928140112-f11e5e49a4ec/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 152 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 153 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 h1:JGgROgKl9N8DuW20oFS5gxc+lE67/N3FcwmBPMe7ArY= 154 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 155 | golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 156 | golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk= 157 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 158 | golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 159 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 160 | golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 161 | golang.org/x/tools v0.1.12 h1:VveCTK38A2rkS8ZqFY25HIDFscX5X9OoEhJd3quQmXU= 162 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= 163 | gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= 164 | gonum.org/v1/gonum v0.8.2 h1:CCXrcPKiGGotvnN6jfUsKk4rRqm7q09/YbKb5xCEvtM= 165 | gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0= 166 | gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0 h1:OE9mWmgKkjJyEmDAAtGMPjXu+YNeGvK9VTSHY6+Qihc= 167 | gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= 168 | gonum.org/v1/plot 
v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= 169 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 170 | gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 171 | gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 172 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= 173 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= 174 | gopkg.in/neurosnap/sentences.v1 v1.0.6/go.mod h1:YlK+SN+fLQZj+kY3r8DkGDhDr91+S3JmTb5LSxFRQo0= 175 | gopkg.in/neurosnap/sentences.v1 v1.0.7 h1:gpTUYnqthem4+o8kyTLiYIB05W+IvdQFYR29erfe8uU= 176 | gopkg.in/neurosnap/sentences.v1 v1.0.7/go.mod h1:YlK+SN+fLQZj+kY3r8DkGDhDr91+S3JmTb5LSxFRQo0= 177 | gopkg.in/warnings.v0 v0.1.2 h1:wFXVbFY8DY5/xOe1ECiWdKCzZlxgshcYVNkBHstARME= 178 | gopkg.in/warnings.v0 v0.1.2/go.mod h1:jksf8JmL6Qr/oQM2OXTHunEvvTAsrWBLb6OOjuVWRNI= 179 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 180 | gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 181 | gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 182 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 183 | gopkg.in/yaml.v3 v3.0.0/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 184 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 185 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 186 | rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= 187 | -------------------------------------------------------------------------------- /licensedb/analysis.go: 
-------------------------------------------------------------------------------- 1 | package licensedb 2 | 3 | import ( 4 | "fmt" 5 | "net/url" 6 | "os" 7 | "sort" 8 | "sync" 9 | 10 | "github.com/go-enry/go-license-detector/v4/licensedb/filer" 11 | ) 12 | 13 | // Analyse runs license analysis on each item in `args` 14 | func Analyse(args ...string) []Result { 15 | nargs := len(args) 16 | results := make([]Result, nargs) 17 | var wg sync.WaitGroup 18 | wg.Add(nargs) 19 | for i, arg := range args { 20 | go func(i int, arg string) { 21 | defer wg.Done() 22 | matches, err := process(arg) 23 | res := Result{Arg: arg, Matches: matches} 24 | if err != nil { 25 | res.ErrStr = err.Error() 26 | } 27 | results[i] = res 28 | }(i, arg) 29 | } 30 | wg.Wait() 31 | 32 | return results 33 | } 34 | 35 | // Result gathers license detection results for a project path 36 | type Result struct { 37 | Arg string `json:"project,omitempty"` 38 | Matches []Match `json:"matches,omitempty"` 39 | ErrStr string `json:"error,omitempty"` 40 | } 41 | 42 | // Match describes the level of confidence for the detected License 43 | type Match struct { 44 | License string `json:"license"` 45 | Confidence float32 `json:"confidence"` 46 | File string `json:"file"` 47 | } 48 | 49 | func process(arg string) ([]Match, error) { 50 | newFiler := filer.FromDirectory 51 | if _, err := os.Stat(arg); err != nil { 52 | if !os.IsNotExist(err) { 53 | return nil, err 54 | } 55 | 56 | if _, err := url.Parse(arg); err == nil { 57 | newFiler = filer.FromGitURL 58 | } else { 59 | return nil, fmt.Errorf("arg should be a valid path or a URL") 60 | } 61 | } 62 | 63 | resolvedFiler, err := newFiler(arg) 64 | if err != nil { 65 | return nil, err 66 | } 67 | 68 | ls, err := Detect(resolvedFiler) 69 | if err != nil { 70 | return nil, err 71 | } 72 | 73 | var matches []Match 74 | for k, v := range ls { 75 | matches = append(matches, Match{k, v.Confidence, v.File}) 76 | } 77 | sort.Slice(matches, func(i, j int) bool { return 
matches[i].Confidence > matches[j].Confidence }) 78 | return matches, nil 79 | } 80 | -------------------------------------------------------------------------------- /licensedb/api/api.go: -------------------------------------------------------------------------------- 1 | package api 2 | 3 | // Match is a detection result of a license with a confidence (0.0 - 1.0) 4 | // and a mapping of files to confidence. 5 | type Match struct { 6 | Files map[string]float32 7 | Confidence float32 8 | File string 9 | } 10 | -------------------------------------------------------------------------------- /licensedb/dataset.projects.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/go-enry/go-license-detector/e0d6f0187f3a3aaeb8236f9860337ffb92438723/licensedb/dataset.projects.gz -------------------------------------------------------------------------------- /licensedb/dataset.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/go-enry/go-license-detector/e0d6f0187f3a3aaeb8236f9860337ffb92438723/licensedb/dataset.zip -------------------------------------------------------------------------------- /licensedb/dataset_test.go: -------------------------------------------------------------------------------- 1 | package licensedb 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "sync" 7 | "testing" 8 | 9 | "github.com/go-enry/go-license-detector/v4/licensedb/api" 10 | "github.com/go-enry/go-license-detector/v4/licensedb/filer" 11 | 12 | "github.com/stretchr/testify/assert" 13 | ) 14 | 15 | func TestDataset(t *testing.T) { 16 | rootFiler, err := filer.FromZIP("dataset.zip") 17 | assert.Nil(t, err) 18 | defer rootFiler.Close() 19 | projects, err := rootFiler.ReadDir("") 20 | assert.Nil(t, err) 21 | licenses := map[string]map[string]api.Match{} 22 | mutex := sync.Mutex{} 23 | wg := sync.WaitGroup{} 24 | wg.Add(len(projects)) 25 | for _, project := range projects 
{ 26 | go func(project filer.File) { 27 | defer wg.Done() 28 | myLicenses, _ := Detect(filer.NestFiler(rootFiler, project.Name)) 29 | if len(myLicenses) > 0 { 30 | mutex.Lock() 31 | licenses[project.Name] = myLicenses 32 | mutex.Unlock() 33 | } 34 | }(project) 35 | } 36 | wg.Wait() 37 | assert.True(t, len(licenses) >= 893) 38 | // the rest len(projects) - 902 do not contain any license information 39 | fmt.Printf("%d %d %d%%\n", len(licenses), 902, (100*len(licenses))/902) 40 | if os.Getenv("LICENSE_TEST_DEBUG") != "" { 41 | for _, project := range projects { 42 | if _, exists := licenses[project.Name]; !exists { 43 | println(project.Name) 44 | } 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /licensedb/filer/filer.go: -------------------------------------------------------------------------------- 1 | package filer 2 | 3 | import ( 4 | "archive/zip" 5 | "bytes" 6 | "io/fs" 7 | "io/ioutil" 8 | "os" 9 | xpath "path" 10 | "path/filepath" 11 | "strings" 12 | 13 | git "github.com/go-git/go-git/v5" 14 | "github.com/go-git/go-git/v5/plumbing" 15 | "github.com/go-git/go-git/v5/plumbing/filemode" 16 | "github.com/go-git/go-git/v5/plumbing/object" 17 | "github.com/go-git/go-git/v5/storage/memory" 18 | "github.com/pkg/errors" 19 | ) 20 | 21 | // File represents a file in the virtual file system: every node is either a regular file 22 | // or a directory. Symlinks are dereferenced in the implementations. 23 | type File struct { 24 | Name string 25 | IsDir bool 26 | } 27 | 28 | // A Filer provides a list of files. 29 | type Filer interface { 30 | // ReadFile returns the contents of a file given it's path. 31 | ReadFile(path string) (content []byte, err error) 32 | // ReadDir lists a directory. 33 | ReadDir(path string) ([]File, error) 34 | // Close frees all the resources allocated by this Filer. 
35 | Close() 36 | // PathsAreAlwaysSlash indicates whether the path separator is platform-independent ("/") or 37 | // OS-specific. 38 | PathsAreAlwaysSlash() bool 39 | } 40 | 41 | // FromDirectory returns a Filer that allows accessing over all the files contained in a directory. 42 | func FromDirectory(path string) (Filer, error) { 43 | fi, err := os.Stat(path) 44 | if err != nil { 45 | return nil, errors.Wrapf(err, "cannot create Filer from %s", path) 46 | } 47 | if !fi.IsDir() { 48 | return nil, errors.New("not a directory") 49 | } 50 | return FromFS(os.DirFS(path)), nil 51 | } 52 | 53 | // FromFS returns a Filer that allows accessing all files in the given file system. 54 | func FromFS(fsys fs.FS) Filer { 55 | return fsFiler{fsys} 56 | } 57 | 58 | type fsFiler struct{ fs fs.FS } 59 | 60 | func (fsys fsFiler) ReadFile(name string) ([]byte, error) { 61 | buf, err := fs.ReadFile(fsys.fs, name) 62 | if err != nil { 63 | return nil, errors.Wrapf(err, "cannot read file %s", name) 64 | } 65 | return buf, nil 66 | } 67 | 68 | func (fsys fsFiler) ReadDir(name string) ([]File, error) { 69 | if name == "" { 70 | name = "." 71 | } 72 | entries, err := fs.ReadDir(fsys.fs, name) 73 | if err != nil { 74 | return nil, errors.Wrapf(err, "cannot read directory %s", name) 75 | } 76 | files := make([]File, len(entries)) 77 | for i, e := range entries { 78 | files[i] = File{ 79 | Name: e.Name(), 80 | IsDir: e.IsDir(), 81 | } 82 | } 83 | return files, nil 84 | } 85 | 86 | func (fsys fsFiler) Close() {} 87 | 88 | func (fsys fsFiler) PathsAreAlwaysSlash() bool { 89 | return true 90 | } 91 | 92 | type gitFiler struct { 93 | root *object.Tree 94 | } 95 | 96 | // FromGitURL returns a Filer that allows to access all the files in a Git repository's default branch given its URL. 97 | // It keeps a shallow single-branch clone of the repository in memory. 
98 | func FromGitURL(url string) (Filer, error) { 99 | repo, err := git.Clone(memory.NewStorage(), nil, &git.CloneOptions{URL: url, Depth: 1}) 100 | if err != nil { 101 | return nil, errors.Wrapf(err, "could not clone repo from %s", url) 102 | } 103 | return FromGit(repo, "") 104 | } 105 | 106 | // FromGit returns a Filer that allows accessing all the files in a Git repository 107 | func FromGit(repo *git.Repository, headRef plumbing.ReferenceName) (Filer, error) { 108 | var head *plumbing.Reference 109 | var err error 110 | if headRef == "" { 111 | head, err = repo.Head() 112 | } else { 113 | head, err = repo.Reference(headRef, true) 114 | } 115 | if err != nil { 116 | return nil, errors.Wrap(err, "could not fetch HEAD from repo") 117 | } 118 | commit, err := repo.CommitObject(head.Hash()) 119 | if err != nil { 120 | return nil, errors.Wrap(err, "could not fetch commit for HEAD") 121 | } 122 | tree, err := commit.Tree() 123 | if err != nil { 124 | return nil, errors.Wrap(err, "could not fetch root for HEAD commit") 125 | } 126 | return &gitFiler{root: tree}, nil 127 | } 128 | 129 | func (filer gitFiler) ReadFile(path string) ([]byte, error) { 130 | entry, err := filer.root.FindEntry(path) 131 | if err != nil { 132 | return nil, errors.Wrapf(err, "cannot find file %s", path) 133 | } 134 | if entry.Mode == filemode.Symlink { 135 | file, err := filer.root.File(path) 136 | if err != nil { 137 | return nil, errors.Wrapf(err, "cannot find file %s", path) 138 | } 139 | path, err = file.Contents() 140 | if err != nil { 141 | return nil, errors.Wrapf(err, "cannot read file %s", path) 142 | } 143 | } 144 | file, err := filer.root.File(path) 145 | if err != nil { 146 | return nil, errors.Wrapf(err, "cannot read file %s", path) 147 | } 148 | reader, err := file.Reader() 149 | if err != nil { 150 | return nil, errors.Wrapf(err, "cannot read file %s", path) 151 | } 152 | defer func() { err = reader.Close() }() 153 | 154 | buf := new(bytes.Buffer) 155 | if _, err = 
buf.ReadFrom(reader); err != nil { 156 | return nil, errors.Wrapf(err, "cannot read file %s", path) 157 | } 158 | return buf.Bytes(), err 159 | } 160 | 161 | func (filer *gitFiler) ReadDir(path string) ([]File, error) { 162 | var tree *object.Tree 163 | if path != "" { 164 | var err error 165 | tree, err = filer.root.Tree(path) 166 | if err != nil { 167 | return nil, errors.Wrapf(err, "cannot read directory %s", path) 168 | } 169 | } else { 170 | tree = filer.root 171 | } 172 | result := make([]File, 0, len(tree.Entries)) 173 | for _, entry := range tree.Entries { 174 | switch entry.Mode { 175 | case filemode.Dir: 176 | result = append(result, File{ 177 | Name: entry.Name, 178 | IsDir: true, 179 | }) 180 | case filemode.Regular, filemode.Executable, filemode.Deprecated, filemode.Symlink: 181 | result = append(result, File{ 182 | Name: entry.Name, 183 | IsDir: false, 184 | }) 185 | } 186 | } 187 | return result, nil 188 | } 189 | 190 | func (filer *gitFiler) Close() { 191 | filer.root = nil 192 | } 193 | 194 | func (filer *gitFiler) PathsAreAlwaysSlash() bool { 195 | return true 196 | } 197 | 198 | type zipNode struct { 199 | children map[string]*zipNode 200 | file *zip.File 201 | } 202 | 203 | type zipFiler struct { 204 | arch *zip.ReadCloser 205 | tree *zipNode 206 | } 207 | 208 | // FromZIP returns a Filer that allows accessing all the files in a ZIP archive given its path. 
209 | func FromZIP(path string) (Filer, error) { 210 | arch, err := zip.OpenReader(path) 211 | if err != nil { 212 | return nil, errors.Wrapf(err, "cannot read ZIP archive %s", path) 213 | } 214 | root := &zipNode{children: map[string]*zipNode{}} 215 | for _, f := range arch.File { 216 | path := strings.Split(f.Name, "/") // zip always has "/" 217 | node := root 218 | for _, part := range path { 219 | if part == "" { 220 | continue 221 | } 222 | child := node.children[part] 223 | if child == nil { 224 | child = &zipNode{children: map[string]*zipNode{}} 225 | node.children[part] = child 226 | } 227 | node = child 228 | } 229 | node.file = f 230 | } 231 | return &zipFiler{arch: arch, tree: root}, nil 232 | } 233 | 234 | func (filer *zipFiler) ReadFile(path string) ([]byte, error) { 235 | parts := strings.Split(path, string("/")) 236 | node := filer.tree 237 | for _, part := range parts { 238 | if part == "" { 239 | continue 240 | } 241 | node = node.children[part] 242 | if node == nil { 243 | return nil, errors.Errorf("does not exist: %s", path) 244 | } 245 | } 246 | reader, err := node.file.Open() 247 | if err != nil { 248 | return nil, errors.Wrapf(err, "cannot open %s", path) 249 | } 250 | defer reader.Close() 251 | buffer, err := ioutil.ReadAll(reader) 252 | if err != nil { 253 | return nil, errors.Wrapf(err, "cannot read %s", path) 254 | } 255 | return buffer, nil 256 | } 257 | 258 | func (filer *zipFiler) ReadDir(path string) ([]File, error) { 259 | parts := strings.Split(path, string("/")) 260 | node := filer.tree 261 | for _, part := range parts { 262 | if part == "" { 263 | continue 264 | } 265 | node = node.children[part] 266 | if node == nil { 267 | return nil, errors.Errorf("does not exist: %s", path) 268 | } 269 | } 270 | if path != "" && !node.file.FileInfo().IsDir() { 271 | return nil, errors.Errorf("not a directory: %s", path) 272 | } 273 | result := make([]File, 0, len(node.children)) 274 | for name, child := range node.children { 275 | result = 
append(result, File{ 276 | Name: name, 277 | IsDir: child.file.FileInfo().IsDir(), 278 | }) 279 | } 280 | return result, nil 281 | } 282 | 283 | func (filer *zipFiler) Close() { 284 | filer.arch.Close() 285 | } 286 | 287 | func (filer *zipFiler) PathsAreAlwaysSlash() bool { 288 | return true 289 | } 290 | 291 | type nestedFiler struct { 292 | origin Filer 293 | offset string 294 | } 295 | 296 | // NestFiler wraps an existing Filer. It prepends the specified prefix to every path. 297 | func NestFiler(filer Filer, prefix string) Filer { 298 | return &nestedFiler{origin: filer, offset: prefix} 299 | } 300 | 301 | func (filer *nestedFiler) ReadFile(path string) ([]byte, error) { 302 | var fullPath string 303 | if filer.origin.PathsAreAlwaysSlash() { 304 | fullPath = xpath.Join(filer.offset, path) 305 | } else { 306 | fullPath = filepath.Join(filer.offset, path) 307 | } 308 | return filer.origin.ReadFile(fullPath) 309 | } 310 | 311 | func (filer *nestedFiler) ReadDir(path string) ([]File, error) { 312 | var fullPath string 313 | if filer.origin.PathsAreAlwaysSlash() { 314 | fullPath = xpath.Join(filer.offset, path) 315 | } else { 316 | fullPath = filepath.Join(filer.offset, path) 317 | } 318 | return filer.origin.ReadDir(fullPath) 319 | } 320 | 321 | func (filer *nestedFiler) Close() { 322 | filer.origin.Close() 323 | } 324 | 325 | func (filer *nestedFiler) PathsAreAlwaysSlash() bool { 326 | return filer.origin.PathsAreAlwaysSlash() 327 | } 328 | -------------------------------------------------------------------------------- /licensedb/filer/filer_test.go: -------------------------------------------------------------------------------- 1 | package filer 2 | 3 | import ( 4 | "sort" 5 | "testing" 6 | 7 | "github.com/stretchr/testify/assert" 8 | ) 9 | 10 | func testFiler(t *testing.T, filer Filer) { 11 | defer filer.Close() 12 | files, err := filer.ReadDir("") 13 | sort.Slice(files, func(i int, j int) bool { 14 | return files[i].Name < files[j].Name 15 | }) 16 | 
assert.Nil(t, err) 17 | assert.Len(t, files, 2) 18 | assert.Equal(t, "one", files[0].Name) 19 | assert.False(t, files[0].IsDir) 20 | assert.Equal(t, "two", files[1].Name) 21 | assert.True(t, files[1].IsDir) 22 | content, err := filer.ReadFile("one") 23 | assert.Nil(t, err) 24 | assert.Equal(t, "hello\n", string(content)) 25 | files, err = filer.ReadDir("two") 26 | assert.Nil(t, err) 27 | assert.Len(t, files, 1) 28 | assert.Equal(t, "three", files[0].Name) 29 | assert.False(t, files[0].IsDir) 30 | content, err = filer.ReadFile("two/three") 31 | assert.Nil(t, err) 32 | assert.Equal(t, "world\n", string(content)) 33 | 34 | files, err = filer.ReadDir("..") 35 | assert.Nil(t, files) 36 | assert.NotNil(t, err) 37 | 38 | files, err = filer.ReadDir("two/three") 39 | assert.Nil(t, files) 40 | assert.NotNil(t, err) 41 | 42 | content, err = filer.ReadFile("two/four") 43 | assert.Nil(t, content) 44 | assert.NotNil(t, err) 45 | } 46 | 47 | func TestLocalFiler(t *testing.T) { 48 | filer, err := FromDirectory("test_data/local") 49 | assert.Nil(t, err) 50 | testFiler(t, filer) 51 | filer, err = FromDirectory("test_data/local2") 52 | assert.Nil(t, filer) 53 | assert.NotNil(t, err) 54 | filer, err = FromDirectory("test_data/local/one") 55 | assert.Nil(t, filer) 56 | assert.NotNil(t, err) 57 | } 58 | 59 | func TestGitFiler(t *testing.T) { 60 | filer, err := FromGitURL("test_data/git") 61 | assert.Nil(t, err) 62 | testFiler(t, filer) 63 | filer, err = FromGitURL("test_data/local2.git") 64 | assert.Nil(t, filer) 65 | assert.NotNil(t, err) 66 | } 67 | 68 | func TestZipFiler(t *testing.T) { 69 | filer, err := FromZIP("test_data/local.zip") 70 | assert.Nil(t, err) 71 | testFiler(t, filer) 72 | filer, err = FromZIP("test_data/local2.zip") 73 | assert.Nil(t, filer) 74 | assert.NotNil(t, err) 75 | } 76 | 77 | func TestNestedFiler(t *testing.T) { 78 | filer, err := FromDirectory("test_data/local") 79 | assert.Nil(t, err) 80 | filer2 := NestFiler(filer, "two") 81 | defer filer2.Close() 82 | 
files, err := filer2.ReadDir("") 83 | assert.Nil(t, err) 84 | assert.Len(t, files, 1) 85 | assert.Equal(t, "three", files[0].Name) 86 | assert.False(t, files[0].IsDir) 87 | content, err := filer2.ReadFile("three") 88 | assert.Nil(t, err) 89 | assert.Equal(t, "world\n", string(content)) 90 | } 91 | -------------------------------------------------------------------------------- /licensedb/filer/test_data/git/COMMIT_EDITMSG: -------------------------------------------------------------------------------- 1 | init 2 | # Please enter the commit message for your changes. Lines starting 3 | # with '#' will be ignored, and an empty message aborts the commit. 4 | # 5 | # On branch master 6 | # 7 | # Initial commit 8 | # 9 | # Changes to be committed: 10 | # new file: one 11 | # new file: two/three 12 | # 13 | -------------------------------------------------------------------------------- /licensedb/filer/test_data/git/HEAD: -------------------------------------------------------------------------------- 1 | ref: refs/heads/master 2 | -------------------------------------------------------------------------------- /licensedb/filer/test_data/git/config: -------------------------------------------------------------------------------- 1 | [core] 2 | repositoryformatversion = 0 3 | filemode = true 4 | bare = false 5 | logallrefupdates = true 6 | -------------------------------------------------------------------------------- /licensedb/filer/test_data/git/description: -------------------------------------------------------------------------------- 1 | Unnamed repository; edit this file 'description' to name the repository. 2 | -------------------------------------------------------------------------------- /licensedb/filer/test_data/git/info/exclude: -------------------------------------------------------------------------------- 1 | # git ls-files --others --exclude-from=.git/info/exclude 2 | # Lines that start with '#' are comments. 
3 | # For a project mostly in C, the following would be a good set of 4 | # exclude patterns (uncomment them if you want to use them): 5 | # *.[oa] 6 | # *~ 7 | -------------------------------------------------------------------------------- /licensedb/filer/test_data/git/logs/HEAD: -------------------------------------------------------------------------------- 1 | 0000000000000000000000000000000000000000 334a82b19a7c893d3807ea52ba35ff2170c296cc Vadim Markovtsev 1525166017 +0200 commit (initial): init 2 | -------------------------------------------------------------------------------- /licensedb/filer/test_data/git/logs/refs/heads/master: -------------------------------------------------------------------------------- 1 | 0000000000000000000000000000000000000000 334a82b19a7c893d3807ea52ba35ff2170c296cc Vadim Markovtsev 1525166017 +0200 commit (initial): init 2 | -------------------------------------------------------------------------------- /licensedb/filer/test_data/git/objects/33/4a82b19a7c893d3807ea52ba35ff2170c296cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/go-enry/go-license-detector/e0d6f0187f3a3aaeb8236f9860337ffb92438723/licensedb/filer/test_data/git/objects/33/4a82b19a7c893d3807ea52ba35ff2170c296cc -------------------------------------------------------------------------------- /licensedb/filer/test_data/git/objects/8a/9b00e3e4f0af606178510c19f40c5a77adb881: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/go-enry/go-license-detector/e0d6f0187f3a3aaeb8236f9860337ffb92438723/licensedb/filer/test_data/git/objects/8a/9b00e3e4f0af606178510c19f40c5a77adb881 -------------------------------------------------------------------------------- /licensedb/filer/test_data/git/objects/cc/628ccd10742baea8241c5924df992b5c019f71: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/go-enry/go-license-detector/e0d6f0187f3a3aaeb8236f9860337ffb92438723/licensedb/filer/test_data/git/objects/cc/628ccd10742baea8241c5924df992b5c019f71 -------------------------------------------------------------------------------- /licensedb/filer/test_data/git/objects/ce/013625030ba8dba906f756967f9e9ca394464a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/go-enry/go-license-detector/e0d6f0187f3a3aaeb8236f9860337ffb92438723/licensedb/filer/test_data/git/objects/ce/013625030ba8dba906f756967f9e9ca394464a -------------------------------------------------------------------------------- /licensedb/filer/test_data/git/objects/f7/922e986704f99de62ca715d4794324a32e9af2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/go-enry/go-license-detector/e0d6f0187f3a3aaeb8236f9860337ffb92438723/licensedb/filer/test_data/git/objects/f7/922e986704f99de62ca715d4794324a32e9af2 -------------------------------------------------------------------------------- /licensedb/filer/test_data/git/refs/heads/master: -------------------------------------------------------------------------------- 1 | 334a82b19a7c893d3807ea52ba35ff2170c296cc 2 | -------------------------------------------------------------------------------- /licensedb/filer/test_data/local.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/go-enry/go-license-detector/e0d6f0187f3a3aaeb8236f9860337ffb92438723/licensedb/filer/test_data/local.zip -------------------------------------------------------------------------------- /licensedb/filer/test_data/local/one: -------------------------------------------------------------------------------- 1 | hello 2 | -------------------------------------------------------------------------------- /licensedb/filer/test_data/local/two/three: 
-------------------------------------------------------------------------------- 1 | world 2 | -------------------------------------------------------------------------------- /licensedb/internal/assets/extract_names.go: -------------------------------------------------------------------------------- 1 | // +build make 2 | 3 | package main 4 | 5 | import ( 6 | "encoding/csv" 7 | "encoding/json" 8 | "io/ioutil" 9 | "log" 10 | "os" 11 | "path" 12 | ) 13 | 14 | func main() { 15 | dir := os.Args[1] 16 | files, err := ioutil.ReadDir(dir) 17 | if err != nil { 18 | log.Fatalf("Listing %s: %v\n", dir, err) 19 | } 20 | writer := csv.NewWriter(os.Stdout) 21 | defer writer.Flush() 22 | for _, file := range files { 23 | var data map[string]interface{} 24 | content, err := ioutil.ReadFile(path.Join(dir, file.Name())) 25 | if err != nil { 26 | log.Fatalf("Reading %s: %v\n", file.Name(), err) 27 | } 28 | json.Unmarshal(content, &data) 29 | name := data["name"].(string) 30 | id := data["licenseId"].(string) 31 | writer.Write([]string{id, name}) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /licensedb/internal/assets/extract_urls.go: -------------------------------------------------------------------------------- 1 | // +build make 2 | 3 | package main 4 | 5 | import ( 6 | "encoding/csv" 7 | "encoding/json" 8 | "io/ioutil" 9 | "log" 10 | "os" 11 | "path" 12 | "strings" 13 | ) 14 | 15 | func main() { 16 | dir := os.Args[1] 17 | files, err := ioutil.ReadDir(dir) 18 | if err != nil { 19 | log.Fatalf("Listing %s: %v\n", dir, err) 20 | } 21 | writer := csv.NewWriter(os.Stdout) 22 | defer writer.Flush() 23 | for _, file := range files { 24 | var data map[string]interface{} 25 | content, err := ioutil.ReadFile(path.Join(dir, file.Name())) 26 | if err != nil { 27 | log.Fatalf("Reading %s: %v\n", file.Name(), err) 28 | } 29 | json.Unmarshal(content, &data) 30 | seeAlso := data["seeAlso"] 31 | if seeAlso != nil { 32 | for _, url := range 
seeAlso.([]interface{}) { 33 | id := data["licenseId"].(string) 34 | strUrl := strings.TrimSpace(url.(string)) 35 | cutIndex := strings.Index(strUrl, "://") 36 | schema := strUrl[:cutIndex] 37 | strUrl = strUrl[cutIndex:] // ignore http/https 38 | if strings.HasSuffix(strUrl, "/legalcode") && strings.HasPrefix(id, "CC") { 39 | strUrl = strUrl[:len(strUrl)-10] 40 | } 41 | writer.Write([]string{id, strUrl, schema}) 42 | } 43 | } 44 | } 45 | writer.Write([]string{"MIT", ".mit-license.org", "https"}) 46 | } 47 | -------------------------------------------------------------------------------- /licensedb/internal/db.go: -------------------------------------------------------------------------------- 1 | package internal 2 | 3 | import ( 4 | "archive/tar" 5 | "bytes" 6 | "encoding/csv" 7 | "errors" 8 | "fmt" 9 | "index/suffixarray" 10 | "io" 11 | "log" 12 | "os" 13 | paths "path" 14 | "regexp" 15 | "sort" 16 | "strings" 17 | 18 | minhashlsh "github.com/ekzhu/minhash-lsh" 19 | "github.com/sergi/go-diff/diffmatchpatch" 20 | 21 | "github.com/go-enry/go-license-detector/v4/licensedb/filer" 22 | "github.com/go-enry/go-license-detector/v4/licensedb/internal/assets" 23 | "github.com/go-enry/go-license-detector/v4/licensedb/internal/fastlog" 24 | "github.com/go-enry/go-license-detector/v4/licensedb/internal/normalize" 25 | "github.com/go-enry/go-license-detector/v4/licensedb/internal/wmh" 26 | ) 27 | 28 | // ErrUnknownLicenseID is raised if license identifier is not known. 29 | // Probably you need to upgrade version of the SPDX. 30 | var ErrUnknownLicenseID = errors.New("license id is not known") 31 | 32 | var ( 33 | licenseReadmeMentionRe = regexp.MustCompile( 34 | fmt.Sprintf("(?i)[^\\s]+/[^/\\s]*(%s)[^\\s]*", 35 | strings.Join(licenseFileNames, "|"))) 36 | ) 37 | 38 | // database holds the license texts, their hashes and the hashtables to query for nearest 39 | // neighbors. 
type database struct {
	// debug enables the diagnostic output; loadLicenses sets it when the
	// LICENSE_DEBUG environment variable is not empty.
	debug bool

	// license name -> text (the text is stored after the Moderate normalization)
	licenseTexts map[string]string
	// minimum license text length, measured on the normalized texts
	minLicenseLength int
	// official license URL (without the schema) -> id
	idByURL map[string]string
	// id -> license URLs (schema + URL)
	urlsByID map[string][]string
	// id -> license name
	nameByID map[string]string
	// all URLs joined into a single alternation
	urlRe *regexp.Regexp
	// first line of each license OR-ed - used to split
	firstLineRe *regexp.Regexp
	// unique unigrams -> index
	tokens map[string]int
	// document frequencies of the unigrams, indexes match with `tokens`
	docfreqs []int
	// Weighted MinHash hashtables
	lsh *minhashlsh.MinhashLSH
	// turns a license text into a hash
	hasher *wmh.WeightedMinHasher
	// part of license short name (e.g., BSL-1.0) -> list of containing license names
	nameShortSubstrings map[string][]substring
	// number of substrings per short license name
	nameShortSubstringSizes map[string]int
	// part of license name (e.g., Boost Software License 1.0) -> list of containing license names
	nameSubstrings map[string][]substring
	// number of substrings per license name
	nameSubstringSizes map[string]int
}
91 | func (db database) VocabularySize() int { 92 | return len(db.tokens) 93 | } 94 | 95 | func loadUrls(db *database) { 96 | urlCSVBytes, err := assets.Asset("urls.csv") 97 | if err != nil { 98 | log.Fatalf("failed to load urls.csv from the assets: %v", err) 99 | } 100 | urlReader := csv.NewReader(bytes.NewReader(urlCSVBytes)) 101 | records, err := urlReader.ReadAll() 102 | if err != nil || len(records) == 0 { 103 | log.Fatalf("failed to parse urls.csv from the assets: %v", err) 104 | } 105 | db.idByURL = map[string]string{} 106 | db.urlsByID = map[string][]string{} 107 | urlReWriter := &bytes.Buffer{} 108 | for i, record := range records { 109 | db.idByURL[record[1]] = record[0] 110 | db.urlsByID[record[0]] = append(db.urlsByID[record[0]], record[2]+record[1]) // schema+url 111 | urlReWriter.Write([]byte(regexp.QuoteMeta(record[1]))) 112 | if i < len(records)-1 { 113 | urlReWriter.WriteRune('|') 114 | } 115 | } 116 | db.urlRe = regexp.MustCompile(urlReWriter.String()) 117 | } 118 | 119 | func loadNames(db *database) { 120 | namesBytes, err := assets.Asset("names.csv") 121 | if err != nil { 122 | log.Fatalf("failed to load banes.csv from the assets: %v", err) 123 | } 124 | namesReader := csv.NewReader(bytes.NewReader(namesBytes)) 125 | records, err := namesReader.ReadAll() 126 | if err != nil || len(records) == 0 { 127 | log.Fatalf("failed to parse names.csv from the assets: %v", err) 128 | } 129 | db.nameByID = map[string]string{} 130 | db.nameSubstringSizes = map[string]int{} 131 | db.nameSubstrings = map[string][]substring{} 132 | for _, record := range records { 133 | db.nameByID[record[0]] = record[1] 134 | registerNameSubstrings(record[1], record[0], db.nameSubstringSizes, db.nameSubstrings) 135 | } 136 | } 137 | 138 | func registerNameSubstrings( 139 | name string, key string, sizes map[string]int, substrs map[string][]substring) { 140 | parts := splitLicenseName(name) 141 | sizes[key] = 0 142 | for _, part := range parts { 143 | if 
licenseReadmeRe.MatchString(part.value) { 144 | continue 145 | } 146 | sizes[key]++ 147 | list := substrs[part.value] 148 | if list == nil { 149 | list = []substring{} 150 | } 151 | list = append(list, substring{value: key, count: part.count}) 152 | substrs[part.value] = list 153 | } 154 | } 155 | 156 | // Load takes the licenses from the embedded storage, normalizes, hashes them and builds the 157 | // LSH hashtables. 158 | func loadLicenses() *database { 159 | db := &database{} 160 | if os.Getenv("LICENSE_DEBUG") != "" { 161 | db.debug = true 162 | } 163 | loadUrls(db) 164 | loadNames(db) 165 | tarBytes, err := assets.Asset("licenses.tar") 166 | if err != nil { 167 | log.Fatalf("failed to load licenses.tar from the assets: %v", err) 168 | } 169 | tarStream := bytes.NewBuffer(tarBytes) 170 | archive := tar.NewReader(tarStream) 171 | db.licenseTexts = map[string]string{} 172 | tokenFreqs := map[string]map[string]int{} 173 | firstLineWriter := &bytes.Buffer{} 174 | firstLineWriter.WriteString("(^|\\n)((.*licen[cs]e\\n\\n)|(") 175 | for header, err := archive.Next(); err != io.EOF; header, err = archive.Next() { 176 | if len(header.Name) <= 6 { 177 | continue 178 | } 179 | key := header.Name[2 : len(header.Name)-4] 180 | text := make([]byte, header.Size) 181 | readSize, readErr := archive.Read(text) 182 | if readErr != nil && readErr != io.EOF { 183 | log.Fatalf("failed to load licenses.tar from the assets: %s: %v", header.Name, readErr) 184 | } 185 | if int64(readSize) != header.Size { 186 | log.Fatalf("failed to load licenses.tar from the assets: %s: incomplete read", header.Name) 187 | } 188 | normedText := normalize.LicenseText(string(text), normalize.Moderate) 189 | if db.minLicenseLength == 0 || db.minLicenseLength > len(normedText) { 190 | db.minLicenseLength = len(normedText) 191 | } 192 | db.licenseTexts[key] = normedText 193 | newLinePos := strings.Index(normedText, "\n") 194 | if newLinePos >= 0 { 195 | 
firstLineWriter.WriteString(regexp.QuoteMeta(normedText[:newLinePos])) 196 | firstLineWriter.WriteRune('|') 197 | } 198 | normedText = normalize.Relax(normedText) 199 | lines := strings.Split(normedText, "\n") 200 | myUniqueTokens := map[string]int{} 201 | tokenFreqs[key] = myUniqueTokens 202 | for _, line := range lines { 203 | tokens := strings.Split(line, " ") 204 | for _, token := range tokens { 205 | myUniqueTokens[token]++ 206 | } 207 | } 208 | } 209 | if db.debug { 210 | log.Println("Minimum license length:", db.minLicenseLength) 211 | log.Println("Number of supported licenses:", len(db.licenseTexts)) 212 | } 213 | firstLineWriter.Truncate(firstLineWriter.Len() - 1) 214 | firstLineWriter.WriteString("))") 215 | db.firstLineRe = regexp.MustCompile(firstLineWriter.String()) 216 | docfreqs := map[string]int{} 217 | for _, tokens := range tokenFreqs { 218 | for token := range tokens { 219 | docfreqs[token]++ 220 | } 221 | } 222 | uniqueTokens := make([]string, len(docfreqs)) 223 | { 224 | i := 0 225 | for token := range docfreqs { 226 | uniqueTokens[i] = token 227 | i++ 228 | } 229 | } 230 | sort.Strings(uniqueTokens) 231 | db.tokens = map[string]int{} 232 | db.docfreqs = make([]int, len(uniqueTokens)) 233 | for i, token := range uniqueTokens { 234 | db.tokens[token] = i 235 | db.docfreqs[i] = docfreqs[token] 236 | } 237 | db.lsh = minhashlsh.NewMinhashLSH64(numHashes, similarityThreshold, len(tokenFreqs)) 238 | if db.debug { 239 | k, l := db.lsh.Params() 240 | log.Println("LSH:", k, l) 241 | } 242 | db.hasher = wmh.NewWeightedMinHasher(len(uniqueTokens), numHashes, 7) 243 | db.nameShortSubstrings = map[string][]substring{} 244 | db.nameShortSubstringSizes = map[string]int{} 245 | for key, tokens := range tokenFreqs { 246 | indices := make([]int, len(tokens)) 247 | values := make([]float32, len(tokens)) 248 | { 249 | i := 0 250 | for t, freq := range tokens { 251 | indices[i] = db.tokens[t] 252 | values[i] = tfidf(freq, db.docfreqs[indices[i]], 
len(db.licenseTexts)) 253 | i++ 254 | } 255 | } 256 | db.lsh.Add(key, db.hasher.Hash(values, indices)) 257 | registerNameSubstrings(key, key, db.nameShortSubstringSizes, db.nameShortSubstrings) 258 | } 259 | db.lsh.Index() 260 | return db 261 | } 262 | 263 | // QueryLicenseText returns the most similar registered licenses. 264 | func (db *database) QueryLicenseText(text string) map[string]float32 { 265 | parts := normalize.Split(text) 266 | licenses := map[string]float32{} 267 | for _, part := range parts { 268 | for key, val := range db.queryLicenseAbstract(part) { 269 | if licenses[key] < val { 270 | licenses[key] = val 271 | } 272 | } 273 | } 274 | return licenses 275 | } 276 | 277 | func (db *database) queryLicenseAbstract(text string) map[string]float32 { 278 | normalizedModerate := normalize.LicenseText(text, normalize.Moderate) 279 | titlePositions := db.firstLineRe.FindAllStringIndex(normalizedModerate, -1) 280 | candidates := db.queryLicenseAbstractNormalized(normalizedModerate) 281 | var prevPos int 282 | var prevMatch string 283 | for i, titlePos := range titlePositions { 284 | begPos := titlePos[0] 285 | match := normalizedModerate[titlePos[0]:titlePos[1]] 286 | if len(match) == 0 { 287 | continue 288 | } 289 | if match[0] == '\n' { 290 | match = match[1:] 291 | } 292 | if match == prevMatch { 293 | begPos = prevPos 294 | } 295 | if normalizedModerate[begPos] == '\n' { 296 | begPos++ 297 | } 298 | var endPos int 299 | if i < len(titlePositions)-1 { 300 | endPos = titlePositions[i+1][0] 301 | } else { 302 | endPos = len(normalizedModerate) 303 | } 304 | part := normalizedModerate[begPos:endPos] 305 | prevMatch = match 306 | prevPos = begPos 307 | if float64(len(part)) < float64(db.minLicenseLength)*similarityThreshold { 308 | continue 309 | } 310 | newCandidates := db.queryLicenseAbstractNormalized(part) 311 | if len(newCandidates) == 0 { 312 | continue 313 | } 314 | for key, val := range newCandidates { 315 | if candidates[key] < val { 316 | 
candidates[key] = val 317 | } 318 | } 319 | } 320 | db.addURLMatches(candidates, text) 321 | return candidates 322 | } 323 | 324 | func (db *database) addURLMatches(candidates map[string]float32, text string) { 325 | for key := range db.scanForURLs(text) { 326 | if db.debug { 327 | println("URL:", key) 328 | } 329 | if conf := candidates[key]; conf < similarityThreshold { 330 | if conf == 0 { 331 | candidates[key] = 1 332 | } else { 333 | candidates[key] = similarityThreshold 334 | } 335 | } 336 | } 337 | } 338 | 339 | func (db *database) queryLicenseAbstractNormalized(normalizedModerate string) map[string]float32 { 340 | normalizedRelaxed := normalize.Relax(normalizedModerate) 341 | if db.debug { 342 | println("\nqueryAbstractNormed --------\n") 343 | println(normalizedModerate) 344 | println("\n========\n") 345 | println(normalizedRelaxed) 346 | } 347 | tokens := map[int]int{} 348 | for _, line := range strings.Split(normalizedRelaxed, "\n") { 349 | for _, token := range strings.Split(line, " ") { 350 | if index, exists := db.tokens[token]; exists { 351 | tokens[index]++ 352 | } 353 | } 354 | } 355 | indices := make([]int, len(tokens)) 356 | values := make([]float32, len(tokens)) 357 | { 358 | i := 0 359 | for key, val := range tokens { 360 | indices[i] = key 361 | values[i] = tfidf(val, db.docfreqs[key], len(db.licenseTexts)) 362 | i++ 363 | } 364 | } 365 | found := db.lsh.Query(db.hasher.Hash(values, indices)) 366 | candidates := map[string]float32{} 367 | if len(found) == 0 { 368 | return candidates 369 | } 370 | for _, keyint := range found { 371 | key := keyint.(string) 372 | licenseText := db.licenseTexts[key] 373 | yourRunes := make([]rune, 0, len(licenseText)/6) 374 | vocabulary := map[string]int{} 375 | for _, line := range strings.Split(licenseText, "\n") { 376 | for _, token := range strings.Split(line, " ") { 377 | index, exists := vocabulary[token] 378 | if !exists { 379 | index = len(vocabulary) 380 | vocabulary[token] = index 381 | } 382 | 
yourRunes = append(yourRunes, rune(index)) 383 | } 384 | } 385 | 386 | oovRune := rune(len(vocabulary)) 387 | myRunes := make([]rune, 0, len(normalizedModerate)/6) 388 | for _, line := range strings.Split(normalizedModerate, "\n") { 389 | for _, token := range strings.Split(line, " ") { 390 | if index, exists := vocabulary[token]; exists { 391 | myRunes = append(myRunes, rune(index)) 392 | } else if len(myRunes) == 0 || myRunes[len(myRunes)-1] != oovRune { 393 | myRunes = append(myRunes, oovRune) 394 | } 395 | } 396 | } 397 | 398 | dmp := diffmatchpatch.New() 399 | diff := dmp.DiffMainRunes(myRunes, yourRunes, false) 400 | 401 | if db.debug { 402 | tokarr := make([]string, len(db.tokens)+1) 403 | for key, val := range vocabulary { 404 | tokarr[val] = key 405 | } 406 | tokarr[len(db.tokens)] = "!" 407 | println(dmp.DiffPrettyText(dmp.DiffCharsToLines(diff, tokarr))) 408 | } 409 | distance := dmp.DiffLevenshtein(diff) 410 | candidates[key] = float32(1) - float32(distance)/float32(len(myRunes)) 411 | } 412 | weak := make([]string, 0, len(candidates)) 413 | for key, val := range candidates { 414 | if val < similarityThreshold { 415 | weak = append(weak, key) 416 | } 417 | } 418 | if len(weak) < len(candidates) { 419 | for _, key := range weak { 420 | delete(candidates, key) 421 | } 422 | } 423 | return candidates 424 | } 425 | 426 | func (db *database) scanForURLs(text string) map[string]bool { 427 | byteText := []byte(text) 428 | index := suffixarray.New(byteText) 429 | urlMatches := index.FindAllIndex(db.urlRe, -1) 430 | licenses := map[string]bool{} 431 | for _, match := range urlMatches { 432 | url := byteText[match[0]:match[1]] 433 | licenses[db.idByURL[string(url)]] = true 434 | } 435 | return licenses 436 | } 437 | 438 | // QueryReadmeText tries to detect licenses mentioned in the README. 
439 | func (db *database) QueryReadmeText(text string, fs filer.Filer) map[string]float32 { 440 | candidates := map[string]float32{} 441 | append := func(others map[string]float32) { 442 | for key, val := range others { 443 | if candidates[key] < val { 444 | candidates[key] = val 445 | } 446 | } 447 | } 448 | for _, match := range licenseReadmeMentionRe.FindAllString(text, -1) { 449 | match = strings.TrimRight(match, ".,:;-") 450 | content, err := fs.ReadFile(match) 451 | if err == nil { 452 | if preprocessor, exists := filePreprocessors[paths.Ext(match)]; exists { 453 | content = preprocessor(content) 454 | } 455 | append(db.QueryLicenseText(string(content))) 456 | } 457 | } 458 | if len(candidates) == 0 { 459 | append(investigateReadmeFile(text, db.nameSubstrings, db.nameSubstringSizes)) 460 | append(investigateReadmeFile(text, db.nameShortSubstrings, db.nameShortSubstringSizes)) 461 | } 462 | if db.debug { 463 | for key, val := range candidates { 464 | println("NLP", key, val) 465 | } 466 | } 467 | db.addURLMatches(candidates, text) 468 | return candidates 469 | } 470 | 471 | // URLs returns the list of the URLs for the given license identifier 472 | func (db *database) URLs(id string) ([]string, error) { 473 | urls, found := db.urlsByID[id] 474 | if !found { 475 | return nil, ErrUnknownLicenseID 476 | } 477 | res := make([]string, len(urls)) 478 | copy(res, urls) 479 | return urls, nil 480 | } 481 | 482 | // Name returns the SPDX name for the license identifier 483 | func (db *database) Name(id string) (string, error) { 484 | name, found := db.nameByID[id] 485 | if !found { 486 | return "", ErrUnknownLicenseID 487 | } 488 | return name, nil 489 | } 490 | 491 | func tfidf(freq int, docfreq int, ndocs int) float32 { 492 | weight := fastlog.Log(1+float32(freq)) * fastlog.Log(float32(ndocs)/float32(docfreq)) 493 | if weight < 0 { 494 | // logarithm is approximate 495 | return 0 496 | } 497 | return weight 498 | } 499 | 
// The following two functions were copied from fastapprox (BSD license).
// They do not calculate the precise value - and we do not need it.

// Log2 calculates the approximate base-2 logarithm.
// The raw IEEE-754 bits of x are reinterpreted as an integer and corrected
// with a rational function of the mantissa.
func Log2(x float32) float32 {
	const (
		mantissaMask = 0x007FFFFF
		halfExponent = 0x3f000000
	)
	bits := math.Float32bits(x)
	// Keep the mantissa of x and replace the rest of the bits with a fixed pattern.
	mantissa := math.Float32frombits(bits&mantissaMask | halfExponent)
	scaled := float32(bits) * 1.1920928955078125e-7
	return scaled - 124.22551499 - 1.498030302*mantissa - 1.72587999/(0.3520887068+mantissa)
}

// Log calculates the approximate natural logarithm.
// It rescales the result of Log2 by a constant factor.
func Log(x float32) float32 {
	const ln2 = 0.69314718
	return ln2 * Log2(x)
}
"github.com/go-enry/go-license-detector/v4/licensedb/filer" 13 | "github.com/go-enry/go-license-detector/v4/licensedb/internal/processors" 14 | ) 15 | 16 | var ( 17 | globalLicenseDB struct { 18 | sync.Once 19 | *database 20 | } 21 | globalLicenseDatabase = func() *database { 22 | globalLicenseDB.Once.Do(func() { 23 | globalLicenseDB.database = loadLicenses() 24 | }) 25 | return globalLicenseDB.database 26 | } 27 | 28 | // Base names of guessable license files 29 | licenseFileNames = []string{ 30 | "li[cs]en[cs]e(s?)", 31 | "legal", 32 | "copy(left|right|ing)", 33 | "unlicense", 34 | "l?gpl([-_ v]?)(\\d\\.?\\d)?", 35 | "bsd", 36 | "mit", 37 | "apache", 38 | } 39 | 40 | // License file extensions. Combined with the fileNames slice 41 | // to create a set of files we can reasonably assume contain 42 | // licensing information. 43 | fileExtensions = []string{ 44 | "", 45 | ".md", 46 | ".rst", 47 | ".html", 48 | ".txt", 49 | } 50 | 51 | filePreprocessors = map[string]func([]byte) []byte{ 52 | ".md": processors.Markdown, 53 | ".rst": processors.RestructuredText, 54 | ".html": processors.HTML, 55 | } 56 | 57 | licenseFileRe = regexp.MustCompile( 58 | fmt.Sprintf("^(|.*[-_. ])(%s)(|[-_. 
].*)$", 59 | strings.Join(licenseFileNames, "|"))) 60 | 61 | readmeFileRe = regexp.MustCompile(fmt.Sprintf("^(readme|guidelines)(%s)$", 62 | strings.Replace(strings.Join(fileExtensions, "|"), ".", "\\.", -1))) 63 | 64 | licenseDirectoryRe = regexp.MustCompile(fmt.Sprintf( 65 | "^(%s)$", strings.Join(licenseFileNames, "|"))) 66 | ) 67 | 68 | func investigateCandidates(candidates map[string][]byte, f func(text []byte) map[string]float32) map[string]api.Match { 69 | matches := make(map[string]api.Match) 70 | for file, text := range candidates { 71 | candidates := f(text) 72 | for name, sim := range candidates { 73 | match := matches[name] 74 | if match.Files == nil { 75 | match.Files = make(map[string]float32) 76 | } 77 | match.Files[file] = sim 78 | if sim > match.Confidence { 79 | match.Confidence = sim 80 | match.File = file 81 | } 82 | matches[name] = match 83 | } 84 | } 85 | return matches 86 | } 87 | 88 | // ExtractLicenseFiles returns the list of possible license texts. 89 | // The file names are matched against the template. 90 | // Reader is used to to read file contents. 91 | func ExtractLicenseFiles(files []string, fs filer.Filer) map[string][]byte { 92 | candidates := make(map[string][]byte) 93 | for _, file := range files { 94 | if licenseFileRe.MatchString(strings.ToLower(paths.Base(file))) { 95 | text, err := fs.ReadFile(file) 96 | if len(text) < 128 { 97 | // e.g. https://github.com/Unitech/pm2/blob/master/LICENSE 98 | realText, err := fs.ReadFile(string(bytes.TrimSpace(text))) 99 | if err == nil { 100 | file = string(bytes.TrimSpace(text)) 101 | text = realText 102 | } 103 | } 104 | if err == nil { 105 | if preprocessor, exists := filePreprocessors[paths.Ext(file)]; exists { 106 | text = preprocessor(text) 107 | } 108 | candidates[file] = text 109 | } 110 | } 111 | } 112 | return candidates 113 | } 114 | 115 | // InvestigateLicenseTexts takes the list of candidate license texts and returns the most probable 116 | // reference licenses matched. 
Each match has the confidence assigned, from 0 to 1, 1 means 100% confident. 117 | // Furthermore, each match contains a mapping of filename to the confidence that file produced. 118 | func InvestigateLicenseTexts(candidates map[string][]byte) map[string]api.Match { 119 | return investigateCandidates(candidates, InvestigateLicenseText) 120 | } 121 | 122 | // InvestigateLicenseText takes the license text and returns the most probable reference licenses matched. 123 | // Each match has the confidence assigned, from 0 to 1, 1 means 100% confident. 124 | func InvestigateLicenseText(text []byte) map[string]float32 { 125 | return globalLicenseDatabase().QueryLicenseText(string(text)) 126 | } 127 | 128 | // ExtractReadmeFiles searches for README files. 129 | // Reader is used to to read file contents. 130 | func ExtractReadmeFiles(files []string, fs filer.Filer) map[string][]byte { 131 | candidates := make(map[string][]byte) 132 | for _, file := range files { 133 | if readmeFileRe.MatchString(strings.ToLower(file)) { 134 | text, err := fs.ReadFile(file) 135 | if err == nil { 136 | if preprocessor, exists := filePreprocessors[paths.Ext(file)]; exists { 137 | text = preprocessor(text) 138 | } 139 | candidates[file] = text 140 | } 141 | } 142 | } 143 | return candidates 144 | } 145 | 146 | // InvestigateReadmeTexts scans README files for licensing information and outputs the 147 | // probable names using NER. 148 | func InvestigateReadmeTexts(candidtes map[string][]byte, fs filer.Filer) map[string]api.Match { 149 | return investigateCandidates(candidtes, func(text []byte) map[string]float32 { 150 | return InvestigateReadmeText(text, fs) 151 | }) 152 | } 153 | 154 | // InvestigateReadmeText scans the README file for licensing information and outputs probable 155 | // names found with Named Entity Recognition from NLP. 
156 | func InvestigateReadmeText(text []byte, fs filer.Filer) map[string]float32 { 157 | return globalLicenseDatabase().QueryReadmeText(string(text), fs) 158 | } 159 | 160 | // IsLicenseDirectory indicates whether the directory is likely to contain licenses. 161 | func IsLicenseDirectory(fileName string) bool { 162 | return licenseDirectoryRe.MatchString(strings.ToLower(fileName)) 163 | } 164 | 165 | // Preload license database 166 | func Preload() { 167 | _ = globalLicenseDatabase() 168 | } 169 | 170 | // LookupURLs returns the list of URLs for the given license identifier 171 | func LookupURLs(id string) ([]string, error) { 172 | return globalLicenseDatabase().URLs(id) 173 | } 174 | 175 | // LookupName returns the SPDX name for the given license identifier 176 | func LookupName(id string) (string, error) { 177 | return globalLicenseDatabase().Name(id) 178 | } 179 | -------------------------------------------------------------------------------- /licensedb/internal/nlp.go: -------------------------------------------------------------------------------- 1 | package internal 2 | 3 | import ( 4 | "regexp" 5 | "sort" 6 | "strings" 7 | "sync" 8 | 9 | "github.com/jdkato/prose/chunk" 10 | "github.com/jdkato/prose/tag" 11 | "github.com/jdkato/prose/tokenize" 12 | ) 13 | 14 | var ( 15 | licenseMarkReadmeRe = regexp.MustCompile(`(?i)(copy(right|ing))|\(c\)|©|(licen[cs][ei])|released under`) 16 | garbageReadmeRe = regexp.MustCompile(`([Cc]opy(right|ing))|\(c\)|©`) 17 | licenseReadmeRe = regexp.MustCompile(`\s*[Ll]icen[cs]e\s*`) 18 | licenseNamePartRe = regexp.MustCompile(`([a-z]+)|([0-9]+)`) 19 | digitsRe = regexp.MustCompile(`[0-9]+`) 20 | disabledNamePartsRe = regexp.MustCompile(`clause|or|only|deprecated|later`) 21 | 22 | tagger = tag.NewPerceptronTagger() 23 | chunkLock sync.Mutex 24 | ) 25 | 26 | // investigateReadmeFile uses NER to match license name mentions. 27 | // It takes two arguments: licenseNameParts and licenseNameSizes. 
28 | // The idea is to map substrings to real licenses, and the confidence is 29 | // / . 30 | func investigateReadmeFile( 31 | text string, licenseNameParts map[string][]substring, 32 | licenseNameSizes map[string]int) map[string]float32 { 33 | matches := licenseMarkReadmeRe.FindAllStringIndex(text, -1) 34 | if len(matches) == 0 { 35 | return map[string]float32{} 36 | } 37 | 38 | // shoot in the dark. Is it a license text? 39 | beginIndex := matches[0][0] 40 | for ; beginIndex >= 1 && text[beginIndex-1:beginIndex+1] != "\n\n"; beginIndex-- { 41 | } 42 | endIndex := matches[len(matches)-1][1] 43 | for ; endIndex < len(text)-1 && text[endIndex:endIndex+2] != "\n\n"; endIndex++ { 44 | } 45 | candidates := globalLicenseDatabase().QueryLicenseText(text[beginIndex:endIndex]) 46 | 47 | beginIndex = matches[0][0] 48 | endIndex = beginIndex + 50 49 | if len(matches) > 1 { 50 | endIndex = matches[len(matches)-1][1] 51 | } else { 52 | beginIndex -= 50 53 | if beginIndex < 0 { 54 | beginIndex = 0 55 | } else { 56 | for ; text[beginIndex] != ' ' && text[beginIndex] != '\t' && 57 | text[beginIndex] != '\n' && beginIndex < matches[0][0]; beginIndex++ { 58 | } 59 | } 60 | for ; endIndex < len(text) && text[endIndex] != ' ' && text[endIndex] != '\t' && 61 | text[endIndex] != '\n'; endIndex++ { 62 | } 63 | } 64 | if endIndex > len(text) { 65 | endIndex = len(text) 66 | } 67 | suspectedText := text[beginIndex:endIndex] 68 | suspectedWords := tokenize.TextToWords(suspectedText) 69 | chunks := readmeChunks(tagger.Tag(suspectedWords)) 70 | for _, entity := range chunks { 71 | if garbageReadmeRe.MatchString(entity) { 72 | continue 73 | } 74 | scores := map[string]map[string]int{} 75 | entity = licenseReadmeRe.ReplaceAllString(entity, " ") 76 | substrs := splitLicenseName(entity) 77 | for _, substr := range substrs { 78 | for _, match := range licenseNameParts[substr.value] { 79 | common := match.count 80 | if substr.count < common { 81 | common = substr.count 82 | } 83 | matchSubstrs := 
scores[match.value] 84 | if matchSubstrs == nil { 85 | matchSubstrs = map[string]int{} 86 | scores[match.value] = matchSubstrs 87 | } 88 | matchSubstrs[substr.value] = common 89 | } 90 | } 91 | // if the only reason a license matched is a single digit, drop it 92 | toRemove := []string{} 93 | for key, matchSubstrs := range scores { 94 | if len(matchSubstrs) == 1 { 95 | for substr := range matchSubstrs { 96 | if digitsRe.MatchString(substr) { 97 | toRemove = append(toRemove, key) 98 | } 99 | } 100 | } 101 | } 102 | for _, key := range toRemove { 103 | delete(scores, key) 104 | } 105 | for key, val := range scores { 106 | matchSize := 0 107 | for _, n := range val { 108 | matchSize += n 109 | } 110 | confidence := float32(matchSize) / float32(licenseNameSizes[key]) 111 | if candidates[key] < confidence && confidence >= 0.3 { 112 | candidates[key] = confidence 113 | } 114 | } 115 | } 116 | return candidates 117 | } 118 | 119 | func readmeChunks(tokens []tag.Token) []string { 120 | chunkLock.Lock() 121 | defer chunkLock.Unlock() 122 | return chunk.Chunk(tokens, chunk.TreebankNamedEntities) 123 | } 124 | 125 | func splitLicenseName(name string) []substring { 126 | counts := map[string]int{} 127 | parts := licenseNamePartRe.FindAllString(strings.ToLower(name), -1) 128 | for i, part := range parts { 129 | if part[len(part)-1] == 'v' && i < len(parts)-1 && digitsRe.MatchString(parts[i+1]) { 130 | part = part[:len(part)-1] 131 | if len(part) == 0 { 132 | continue 133 | } 134 | } 135 | if disabledNamePartsRe.MatchString(part) { 136 | continue 137 | } 138 | // BSD hack 139 | if part == "simplified" { 140 | part = "2" 141 | } 142 | counts[part]++ 143 | } 144 | result := make([]substring, len(counts)) 145 | i := 0 146 | for key, val := range counts { 147 | result[i] = substring{value: key, count: val} 148 | i++ 149 | } 150 | sort.Slice(result, func(i int, j int) bool { 151 | return result[i].value > result[j].value 152 | }) 153 | return result 154 | } 155 | 
-------------------------------------------------------------------------------- /licensedb/internal/nlp_test.go: -------------------------------------------------------------------------------- 1 | package internal 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | ) 8 | 9 | func TestSplitLicenseName(t *testing.T) { 10 | assert.Equal(t, []substring{ 11 | {"gpl", 1}, 12 | {"3", 1}, 13 | }, splitLicenseName("GPLv3")) 14 | assert.Equal(t, []substring{ 15 | {"gpl", 1}, 16 | {"3", 2}, 17 | }, splitLicenseName("GPL-3-3")) 18 | assert.Equal(t, []substring{ 19 | {"apache", 1}, 20 | {"2", 1}, 21 | {"0", 1}, 22 | }, splitLicenseName("Apache-2.0")) 23 | assert.Equal(t, []substring{ 24 | {"bsd", 1}, 25 | {"2", 1}, 26 | }, splitLicenseName("Simplified BSD")) 27 | assert.Equal(t, []substring{ 28 | {"gpl", 1}, 29 | {"2", 1}, 30 | }, splitLicenseName("GPL-2-deprecated")) 31 | } 32 | -------------------------------------------------------------------------------- /licensedb/internal/normalize/normalize.go: -------------------------------------------------------------------------------- 1 | package normalize 2 | 3 | import ( 4 | "bytes" 5 | "regexp" 6 | "strings" 7 | "unicode" 8 | 9 | "golang.org/x/text/runes" 10 | "golang.org/x/text/transform" 11 | "golang.org/x/text/unicode/norm" 12 | ) 13 | 14 | var ( 15 | lineEndingsRe = regexp.MustCompile(`\r\n?`) 16 | // 3.1.1 All whitespace should be treated as a single blank space. 17 | whitespaceRe = regexp.MustCompile(`[ \t\f\r              ​]+`) 18 | trailingWhitespaceRe = regexp.MustCompile(`(?m)[ \t\f\r              ​]$`) 19 | licenseHeaderRe = regexp.MustCompile(`(licen[cs]e)\.?\n\n`) 20 | leadingWhitespaceRe = regexp.MustCompile(`(?m)^(( \n?)|\n)`) 21 | // 5.1.2 Hyphens, Dashes Any hyphen, dash, en dash, em dash, or other variation should be 22 | // considered equivalent. 
23 | punctuationRe = regexp.MustCompile(`[-‒–—―⁓⸺⸻~˗‐‑⁃⁻₋−∼⎯⏤─➖𐆑֊﹘﹣-]+`) 24 | // 5.1.3 Quotes Any variation of quotations (single, double, curly, etc.) should be considered 25 | // equivalent. 26 | quotesRe = regexp.MustCompile(`["'“”‘’„‚«»‹›❛❜❝❞\x60]+`) 27 | // 7.1.1 Where a line starts with a bullet, number, letter, or some form of a list item 28 | // (determined where list item is followed by a space, then the text of the sentence), ignore 29 | // the list item for matching purposes. 30 | bulletRe = regexp.MustCompile(`(?m)^(([-*✱﹡•●⚫⏺🞄∙⋅])|([(\[{]?\d+[.)\]}] ?)|([(\[{]?[a-z][.)\]}] ?)|([(\[{]?i+[.)\]} ] ?))`) 31 | // 8.1.1 The words in the following columns are considered equivalent and interchangeable. 32 | wordReplacer = strings.NewReplacer( 33 | "acknowledgment", "acknowledgement", 34 | "analogue", "analog", 35 | "analyse", "analyze", 36 | "artefact", "artifact", 37 | "authorisation", "authorization", 38 | "authorised", "authorized", 39 | "calibre", "caliber", 40 | "cancelled", "canceled", 41 | "capitalisations", "capitalizations", 42 | "catalogue", "catalog", 43 | "categorise", "categorize", 44 | "centre", "center", 45 | "emphasised", "emphasized", 46 | "favour", "favor", 47 | "favourite", "favorite", 48 | "fulfil", "fulfill", 49 | "fulfilment", "fulfillment", 50 | "initialise", "initialize", 51 | "judgment", "judgement", 52 | "labelling", "labeling", 53 | "labour", "labor", 54 | "licence", "license", 55 | "maximise", "maximize", 56 | "modelled", "modeled", 57 | "modelling", "modeling", 58 | "offence", "offense", 59 | "optimise", "optimize", 60 | "organisation", "organization", 61 | "organise", "organize", 62 | "practise", "practice", 63 | "programme", "program", 64 | "realise", "realize", 65 | "recognise", "recognize", 66 | "signalling", "signaling", 67 | "sub-license", "sublicense", 68 | "sub license", "sub-license", 69 | "utilisation", "utilization", 70 | "whilst", "while", 71 | "wilful", "wilfull", 72 | "non-commercial", "noncommercial", 73 | "per 
cent", "percent", 74 | "copyright owner", "copyright", 75 | ) 76 | 77 | // 9.1.1 "©", "(c)", or "Copyright" should be considered equivalent and interchangeable. 78 | copyrightRe = regexp.MustCompile(`copyright|\(c\)`) 79 | trademarkRe = regexp.MustCompile(`trademark(s?)|\(tm\)`) 80 | 81 | // extra cleanup 82 | brokenLinkRe = regexp.MustCompile(`http s ://`) 83 | urlCleanupRe = regexp.MustCompile(`[<(](http(s?)://[^\s]+)[)>]`) 84 | copyrightLineRe = regexp.MustCompile(`(?m)^((©.*)|(all rights reserved(\.)?)|(li[cs]en[cs]e))\n`) 85 | nonAlphaNumRe = regexp.MustCompile(`[^- \na-z0-9]`) 86 | 87 | // used in Split() 88 | splitRe = regexp.MustCompile(`\n\s*[^a-zA-Z0-9_,()]{3,}\s*\n`) 89 | ) 90 | 91 | // Strictness represents the aggressiveness of the performed normalization. The bigger the number, 92 | // the more aggressive. See `Enforced`, `Moderate` and `Relaxed`. 93 | type Strictness int 94 | 95 | const ( 96 | // Enforced is the strictest mode - only the official SPDX guidelines are applied. 97 | Enforced Strictness = 0 98 | // Moderate is equivalent to Enforced with some additional normalization: dots are removed, copyright lines too. 99 | Moderate Strictness = 1 100 | // Relaxed is the most powerful normalization, Moderate + Unicode normalization and all non-alphanumeric chars removed. 101 | Relaxed Strictness = 2 102 | ) 103 | 104 | // LicenseText makes a license text ready for analysis. 105 | // It follows SPDX guidelines at 106 | // https://spdx.org/spdx-license-list/matching-guidelines 107 | func LicenseText(text string, strictness Strictness) string { 108 | // Line endings 109 | text = lineEndingsRe.ReplaceAllString(text, "\n") 110 | 111 | // 4. Capitalization 112 | text = strings.ToLower(text) 113 | 114 | // 3. 
Whitespace 115 | text = whitespaceRe.ReplaceAllString(text, " ") 116 | text = trailingWhitespaceRe.ReplaceAllString(text, "") 117 | text = licenseHeaderRe.ReplaceAllString(text, "$1\nthisislikelyalicenseheaderplaceholder\n") 118 | text = leadingWhitespaceRe.ReplaceAllString(text, "") 119 | 120 | // 5. Punctuation 121 | text = punctuationRe.ReplaceAllString(text, "-") 122 | text = quotesRe.ReplaceAllString(text, "\"") 123 | 124 | // 7. Bullets and Numbering 125 | text = bulletRe.ReplaceAllString(text, "") 126 | 127 | // 8. Varietal Word Spelling 128 | text = wordReplacer.Replace(text) 129 | 130 | // 9. Copyright Symbol 131 | text = copyrightRe.ReplaceAllString(text, "©") 132 | text = trademarkRe.ReplaceAllString(text, "™") 133 | 134 | // fix broken URLs in SPDX source texts 135 | text = brokenLinkRe.ReplaceAllString(text, "https://") 136 | 137 | // fix URLs in <> - erase the decoration 138 | text = urlCleanupRe.ReplaceAllString(text, "$1") 139 | 140 | // collapse several non-alphanumeric characters 141 | { 142 | buffer := &bytes.Buffer{} 143 | back := '\x00' 144 | for _, char := range text { 145 | if !unicode.IsLetter(char) && !unicode.IsDigit(char) && back == char { 146 | continue 147 | } 148 | back = char 149 | buffer.WriteRune(char) 150 | } 151 | text = buffer.String() 152 | } 153 | 154 | if strictness > Enforced { 155 | // there are common mismatches because of trailing dots 156 | text = strings.Replace(text, ".", "", -1) 157 | // usually copyright lines are custom and occur multiple times 158 | text = copyrightLineRe.ReplaceAllString(text, "") 159 | } 160 | 161 | if strictness > Moderate { 162 | return Relax(text) 163 | } 164 | 165 | text = leadingWhitespaceRe.ReplaceAllString(text, "") 166 | text = strings.Replace(text, "thisislikelyalicenseheaderplaceholder", "", -1) 167 | 168 | return text 169 | } 170 | 171 | // Relax applies very aggressive normalization rules to text. 
172 | func Relax(text string) string { 173 | buffer := &bytes.Buffer{} 174 | writer := transform.NewWriter( 175 | buffer, transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)) 176 | _, _ = writer.Write([]byte(text)) 177 | _ = writer.Close() 178 | text = buffer.String() 179 | text = nonAlphaNumRe.ReplaceAllString(text, "") 180 | text = leadingWhitespaceRe.ReplaceAllString(text, "") 181 | text = strings.Replace(text, " ", " ", -1) 182 | return text 183 | } 184 | 185 | // Split applies heuristics to split the text into several parts 186 | func Split(text string) []string { 187 | result := []string{text} 188 | 189 | // Always add the full text 190 | splitted := splitRe.Split(text, -1) 191 | if len(splitted) > 1 { 192 | result = append(result, splitted...) 193 | } 194 | return result 195 | } 196 | -------------------------------------------------------------------------------- /licensedb/internal/normalize/normalize_test.go: -------------------------------------------------------------------------------- 1 | package normalize 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | ) 8 | 9 | func TestNormalizeLines(t *testing.T) { 10 | tt := []struct { 11 | name string 12 | in, out string 13 | }{ 14 | {"lines", "a\r\nb\rc\n\r", "a\nb\nc\n"}, 15 | {"whitespace", " a\n b\nc \n", "a\nb\nc\n"}, 16 | {"quotes lowercase", 17 | `“You” (or “Your”) shall mean an individual or Legal Entity exercising 18 | permissions granted by this License.`, 19 | `"you" (or "your") shall mean an individual or legal entity exercising 20 | permissions granted by this license.`}, 21 | {"normalize links", "A B", "a https:/fsf.org/ b"}, 22 | {"license", "license.\n\nlicence\n\n", "license\n\nlicense\n\n"}, 23 | {"punctuation", "a-‒–—―⁓⸺⸻~˗‐‑⁃⁻₋−∼⎯⏤─➖𐆑֊﹘﹣-", "a-"}, 24 | {"bullet", "-\n*\n✱\n﹡\n•\n●\n⚫\n⏺\n🞄\n∙\n⋅\n", ""}, 25 | {"license", "", ""}, 26 | } 27 | 28 | for _, tc := range tt { 29 | assert.Equal(t, tc.out, LicenseText(tc.in, Enforced)) 30 | } 31 | } 
32 | -------------------------------------------------------------------------------- /licensedb/internal/processors/html2text.go: -------------------------------------------------------------------------------- 1 | package processors 2 | 3 | import ( 4 | "bytes" 5 | "regexp" 6 | "strconv" 7 | "strings" 8 | 9 | "golang.org/x/net/html" 10 | ) 11 | 12 | var ( 13 | skipHTMLRe = regexp.MustCompile(`^(head|script|style|object)$`) 14 | htmlHeaderRe = regexp.MustCompile(`^h[2-6]$`) 15 | htmlEntityRe = regexp.MustCompile(`&((#\\d+)|([a-zA-Z]+));`) 16 | marksRe = regexp.MustCompile(`[#$%*\/\\|><~\x60=!?.,:;\"'\])}-]`) 17 | ) 18 | 19 | func parseHTMLEntity(entName []byte) []byte { 20 | entNameStr := strings.ToLower(string(entName[1 : len(entName)-1])) 21 | 22 | if entNameStr[0] == '#' { 23 | val, err := strconv.Atoi(entNameStr[1:]) 24 | if err != nil { 25 | return entName 26 | } 27 | return []byte(string(rune(val))) 28 | } 29 | // the list is not full 30 | switch entNameStr { 31 | case "nbsp": 32 | return []byte(" ") 33 | case "gt": 34 | return []byte(">") 35 | case "lt": 36 | return []byte("<") 37 | case "amp": 38 | return []byte("&") 39 | case "quot": 40 | return []byte("\"") 41 | case "apos": 42 | return []byte("'") 43 | case "cent": 44 | return []byte("¢") 45 | case "pound": 46 | return []byte("£") 47 | case "yen": 48 | return []byte("¥") 49 | case "euro": 50 | return []byte("€") 51 | case "copy": 52 | return []byte("©") 53 | case "reg": 54 | return []byte("®") 55 | case "ldquo": 56 | return []byte("“") 57 | case "rdquo": 58 | return []byte("”") 59 | case "lsquo": 60 | return []byte("‘") 61 | case "rsquo": 62 | return []byte("’") 63 | case "sbquo": 64 | return []byte("‚") 65 | case "rbquo": 66 | return []byte("\"") 67 | case "bdquo": 68 | return []byte("„") 69 | case "ndash": 70 | return []byte("–") 71 | case "mdash": 72 | return []byte("—") 73 | case "bull": 74 | return []byte("•") 75 | case "hellip": 76 | return []byte("…") 77 | case "prime": 78 | return []byte("′") 79 
| case "lsaquo": 80 | return []byte("‹") 81 | case "rsaquo": 82 | return []byte("›") 83 | case "trade": 84 | return []byte("™") 85 | case "minus": 86 | return []byte("−") 87 | case "raquo": 88 | return []byte("»") 89 | case "laquo": 90 | return []byte("«") 91 | case "deg": 92 | return []byte("°") 93 | case "sect": 94 | return []byte("§") 95 | case "iexcl": 96 | return []byte("¡") 97 | default: 98 | return entName 99 | } 100 | } 101 | 102 | // HTML converts HTML to plain text. E.g. it rips all the tags. 103 | func HTML(htmlSource []byte) []byte { 104 | result := &bytes.Buffer{} 105 | doc := html.NewTokenizer(bytes.NewReader(htmlSource)) 106 | skip := false 107 | var href []byte 108 | for token := doc.Next(); token != html.ErrorToken; token = doc.Next() { 109 | tagName, _ := doc.TagName() 110 | if skipHTMLRe.Match(tagName) { 111 | if doc.Token().Type != html.SelfClosingTagToken { 112 | skip = !skip 113 | } 114 | continue 115 | } 116 | if skip { 117 | continue 118 | } 119 | text := doc.Text() 120 | if href != nil && doc.Token().Type == html.TextToken { 121 | myhref := href 122 | href = nil 123 | if bytes.Equal(myhref, text) { 124 | continue 125 | } else { 126 | result.WriteRune(' ') 127 | } 128 | } 129 | text = htmlEntityRe.ReplaceAllFunc(text, parseHTMLEntity) 130 | text = bytes.Replace(text, []byte("\u00a0"), []byte(" "), -1) 131 | result.Write(text) 132 | strTagName := string(tagName) 133 | if strTagName == "br" { 134 | result.WriteRune('\n') 135 | } else if strTagName == "hr" { 136 | result.WriteString("---") 137 | } else if strTagName == "a" { 138 | for key, val, _ := doc.TagAttr(); key != nil; key, val, _ = doc.TagAttr() { 139 | if string(key) == "href" { 140 | result.Write(val) 141 | href = val 142 | break 143 | } 144 | } 145 | } else if htmlHeaderRe.Match(tagName) && doc.Token().Type == html.EndTagToken { 146 | if result.Len() > 0 && !marksRe.MatchString(string(result.Bytes()[result.Len()-1])) { 147 | result.WriteRune('.') 148 | } 149 | } 150 | } 151 | return 
result.Bytes() 152 | } 153 | -------------------------------------------------------------------------------- /licensedb/internal/processors/html2text_test.go: -------------------------------------------------------------------------------- 1 | package processors 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | ) 8 | 9 | func TestHTML(t *testing.T) { 10 | text := `

Title

11 |

some text

12 |

Another title

13 |
14 |

And a third one.

15 |   > < & " ' ¢ £ ¥ € © ® 16 | “ ” ‘ ’ ‚ &rbquo; „ – — • 17 | … ′ ‹ › ™ − » « ° § 18 | ¡ 19 | &nbsp; &gt; &lt; &amp; &quot; &apos; &cent; &pound; &yen; &euro; &copy; &reg; 20 | &ldquo; &rdquo; &lsquo; &rsquo; &sbquo; &rbquo; &bdquo; &ndash; &mdash; &bull; 21 | &hellip; &prime; &lsaquo; &rsaquo; &trade; &minus; &raquo; &laquo; &deg; &sect; 22 | &iexcl; 23 | blah blah LINK text proceeds
24 | hello 25 | 26 | http://foo` 27 | assert.Equal(t, `Title 28 | some text 29 | Another title. 30 | --- 31 | And a third one. 32 | > < & " ' ¢ £ ¥ € © ® 33 | “ ” ‘ ’ ‚ " „ – — • 34 | … ′ ‹ › ™ − » « ° § 35 | ¡ 36 | > < & " ' ¢ £ ¥ € © ® 37 | “ ” ‘ ’ ‚ " „ – — • 38 | … ′ ‹ › ™ − » « ° § 39 | ¡ 40 | blah blah wow:// LINK text proceeds 41 | 42 | hello 43 | 44 | http://foo`, string(HTML([]byte(text)))) 45 | } 46 | -------------------------------------------------------------------------------- /licensedb/internal/processors/markup.go: -------------------------------------------------------------------------------- 1 | package processors 2 | 3 | import ( 4 | "bytes" 5 | "sync" 6 | 7 | rst "github.com/hhatto/gorst" 8 | "github.com/russross/blackfriday/v2" 9 | ) 10 | 11 | var ( 12 | parserLock sync.Mutex 13 | ) 14 | 15 | // Markdown converts Markdown to plain text. It tries to revert all the decorations. 16 | func Markdown(text []byte) []byte { 17 | html := blackfriday.Run(text) 18 | // Repeat to times to heal broken HTML 19 | return HTML(html) 20 | } 21 | 22 | // RestructuredText converts ReStructuredText to plain text. 23 | // It tries to revert all the decorations. 
24 | func RestructuredText(text []byte) []byte { 25 | parserLock.Lock() 26 | defer parserLock.Unlock() 27 | parser := rst.NewParser(nil) 28 | input := bytes.NewBuffer(text) 29 | output := &bytes.Buffer{} 30 | parser.ReStructuredText(input, rst.ToHTML(output)) 31 | // Repeat to times to heal broken HTML 32 | return HTML(output.Bytes()) 33 | } 34 | -------------------------------------------------------------------------------- /licensedb/internal/wmh/test_data/wmh.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/go-enry/go-license-detector/e0d6f0187f3a3aaeb8236f9860337ffb92438723/licensedb/internal/wmh/test_data/wmh.bin -------------------------------------------------------------------------------- /licensedb/internal/wmh/wmh.go: -------------------------------------------------------------------------------- 1 | package wmh 2 | 3 | import ( 4 | "encoding/binary" 5 | "errors" 6 | "log" 7 | "math" 8 | "reflect" 9 | "unsafe" 10 | 11 | "github.com/go-enry/go-license-detector/v4/licensedb/internal/fastlog" 12 | "golang.org/x/exp/rand" 13 | "gonum.org/v1/gonum/stat/distuv" 14 | ) 15 | 16 | const maxUint16 = 65535 17 | 18 | // WeightedMinHasher calculates Weighted MinHash-es. 19 | // https://ekzhu.github.io/datasketch/weightedminhash.html 20 | type WeightedMinHasher struct { 21 | // Size of each hash element in bits. Supported values are 16, 32 and 64. 22 | Bitness int 23 | 24 | dim int 25 | sampleSize int 26 | rs [][]float32 27 | lnCs [][]float32 28 | betas [][]uint16 // attempt to save some memory - [0, 1] is scaled to maxUint16 29 | } 30 | 31 | // NewWeightedMinHasher initializes a new instance of WeightedMinHasher. 32 | // `dim` is the bag size. 33 | // `sampleSize` is the hash length. 34 | // `seed` is the random generator seed, as Weighted MinHash is probabilistic. 
35 | func NewWeightedMinHasher(dim int, sampleSize int, seed int64) *WeightedMinHasher { 36 | randSrc := rand.New(rand.NewSource(uint64(seed))) 37 | gammaGen := distuv.Gamma{Alpha: 2, Beta: 1, Src: randSrc} 38 | hasher := &WeightedMinHasher{Bitness: 64, dim: dim, sampleSize: sampleSize} 39 | hasher.rs = make([][]float32, sampleSize) 40 | for y := 0; y < sampleSize; y++ { 41 | arr := make([]float32, dim) 42 | hasher.rs[y] = arr 43 | for x := 0; x < dim; x++ { 44 | arr[x] = float32(gammaGen.Rand()) 45 | } 46 | } 47 | hasher.lnCs = make([][]float32, sampleSize) 48 | for y := 0; y < sampleSize; y++ { 49 | arr := make([]float32, dim) 50 | hasher.lnCs[y] = arr 51 | for x := 0; x < dim; x++ { 52 | arr[x] = fastlog.Log(float32(gammaGen.Rand())) 53 | } 54 | } 55 | uniformGen := distuv.Uniform{Min: 0, Max: 1, Src: randSrc} 56 | hasher.betas = make([][]uint16, sampleSize) 57 | for y := 0; y < sampleSize; y++ { 58 | arr := make([]uint16, dim) 59 | hasher.betas[y] = arr 60 | for x := 0; x < dim; x++ { 61 | arr[x] = uint16(uniformGen.Rand() * maxUint16) 62 | } 63 | } 64 | return hasher 65 | } 66 | 67 | // MarshalBinary serializes the WeightedMinHasher. 
68 | func (wmh *WeightedMinHasher) MarshalBinary() (data []byte, err error) { 69 | data = make([]byte, 9+wmh.sampleSize*wmh.dim*(4*2+2)) 70 | data[0] = byte(wmh.Bitness) 71 | binary.LittleEndian.PutUint32(data[1:5], uint32(wmh.dim)) 72 | binary.LittleEndian.PutUint32(data[5:9], uint32(wmh.sampleSize)) 73 | offset := 9 74 | writeFloat32Slice := func(arr []float32) { 75 | header := (*reflect.SliceHeader)(unsafe.Pointer(&arr)) 76 | header.Len *= 4 77 | header.Cap *= 4 78 | buffer := *(*[]byte)(unsafe.Pointer(header)) 79 | copy(data[offset:], buffer) 80 | offset += len(buffer) 81 | } 82 | for _, arr := range wmh.rs { 83 | writeFloat32Slice(arr) 84 | } 85 | for _, arr := range wmh.lnCs { 86 | writeFloat32Slice(arr) 87 | } 88 | for _, arr := range wmh.betas { 89 | header := (*reflect.SliceHeader)(unsafe.Pointer(&arr)) 90 | header.Len *= 2 91 | header.Cap *= 2 92 | buffer := *(*[]byte)(unsafe.Pointer(header)) 93 | copy(data[offset:], buffer) 94 | offset += len(buffer) 95 | } 96 | return data, nil 97 | } 98 | 99 | // UnmarshalBinary reads a WeightedMinHasher previously serialized with MarshalBinary(). 
100 | func (wmh *WeightedMinHasher) UnmarshalBinary(data []byte) error { 101 | if len(data) < 9 { 102 | return errors.New("invalid binary format: no header") 103 | } 104 | wmh.Bitness = int(data[0]) 105 | wmh.dim = int(binary.LittleEndian.Uint32(data[1:5])) 106 | wmh.sampleSize = int(binary.LittleEndian.Uint32(data[5:9])) 107 | if len(data)-9 != wmh.sampleSize*wmh.dim*(4*2+2) { 108 | return errors.New("invalid binary format: body size mismatch") 109 | } 110 | wmh.rs = make([][]float32, wmh.sampleSize) 111 | wmh.lnCs = make([][]float32, wmh.sampleSize) 112 | wmh.betas = make([][]uint16, wmh.sampleSize) 113 | readFloat32Slice := func(dest []float32, src []byte) { 114 | header := (*reflect.SliceHeader)(unsafe.Pointer(&src)) 115 | header.Len /= 4 116 | header.Cap /= 4 117 | buffer := *(*[]float32)(unsafe.Pointer(header)) 118 | copy(dest, buffer) 119 | } 120 | offset := 9 121 | for i := range wmh.rs { 122 | wmh.rs[i] = make([]float32, wmh.dim) 123 | nextOffset := offset + wmh.dim*4 124 | readFloat32Slice(wmh.rs[i], data[offset:nextOffset]) 125 | offset = nextOffset 126 | } 127 | for i := range wmh.lnCs { 128 | wmh.lnCs[i] = make([]float32, wmh.dim) 129 | nextOffset := offset + wmh.dim*4 130 | readFloat32Slice(wmh.lnCs[i], data[offset:nextOffset]) 131 | offset = nextOffset 132 | } 133 | for i := range wmh.betas { 134 | wmh.betas[i] = make([]uint16, wmh.dim) 135 | nextOffset := offset + wmh.dim*2 136 | slice := data[offset:nextOffset] 137 | header := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) 138 | header.Len /= 2 139 | header.Cap /= 2 140 | buffer := *(*[]uint16)(unsafe.Pointer(header)) 141 | copy(wmh.betas[i], buffer) 142 | offset = nextOffset 143 | } 144 | return nil 145 | } 146 | 147 | // Hash calculates the Weighted MinHash from the weighted bag of features. 148 | // Each feature has an index and a value. 
149 | func (wmh *WeightedMinHasher) Hash(values []float32, indices []int) []uint64 { 150 | if len(values) != len(indices) { 151 | log.Panicf("len(values)=%d is not equal to len(indices)=%d", len(values), len(indices)) 152 | } 153 | for i, v := range values { 154 | if v < 0 { 155 | log.Panicf("negative value in the vector: %f @ %d", v, i) 156 | } 157 | } 158 | for vi, j := range indices { 159 | if j >= wmh.dim { 160 | log.Panicf("index is out of range: %d @ %d", j, vi) 161 | } 162 | } 163 | hashvalues := make([]uint64, wmh.sampleSize) 164 | for s := 0; s < wmh.sampleSize; s++ { 165 | minLnA := float32(math.MaxFloat32) 166 | var k int 167 | var minT float32 168 | for vi, j := range indices { 169 | vlog := fastlog.Log(values[vi]) 170 | beta := float32(wmh.betas[s][j]) / float32(maxUint16) 171 | // t = np.floor((vlog / self.rs[i]) + self.betas[i]) 172 | t := float32(math.Floor(float64(vlog/wmh.rs[s][j] + beta))) 173 | // ln_y = (t - self.betas[i]) * self.rs[i] 174 | lnY := (t - beta) * wmh.rs[s][j] 175 | // ln_a = self.ln_cs[i] - ln_y - self.rs[i] 176 | lnA := wmh.lnCs[s][j] - lnY - wmh.rs[s][j] 177 | // k = np.nanargmin(ln_a) 178 | if lnA < minLnA { 179 | minLnA = lnA 180 | k = j 181 | minT = t 182 | } 183 | } 184 | // hashvalues[i][0], hashvalues[i][1] = k, int(t[k]) 185 | switch wmh.Bitness { 186 | case 64: 187 | hashvalues[s] = uint64(uint64(k) | (uint64(minT) << 32)) 188 | case 32: 189 | hashvalues[s] = uint64(uint32(k) | (uint32(minT) << 16)) 190 | case 16: 191 | hashvalues[s] = uint64(uint16(k) | (uint16(minT) << 8)) 192 | default: 193 | log.Fatalf("unsupported bitness value: %d", wmh.Bitness) 194 | } 195 | } 196 | return hashvalues 197 | } 198 | -------------------------------------------------------------------------------- /licensedb/internal/wmh/wmh_test.go: -------------------------------------------------------------------------------- 1 | package wmh 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | ) 8 | 9 | func 
TestWMHSerialize(t *testing.T) { 10 | hasher := NewWeightedMinHasher(100, 50, 7) 11 | bytes, err := hasher.MarshalBinary() 12 | assert.Nil(t, err) 13 | newHasher := &WeightedMinHasher{} 14 | err = newHasher.UnmarshalBinary(bytes) 15 | assert.Nil(t, err) 16 | assert.Equal(t, hasher.Bitness, newHasher.Bitness) 17 | assert.Equal(t, hasher.dim, newHasher.dim) 18 | assert.Equal(t, hasher.sampleSize, newHasher.sampleSize) 19 | assert.Equal(t, hasher.rs, newHasher.rs) 20 | assert.Equal(t, hasher.lnCs, newHasher.lnCs) 21 | assert.Equal(t, hasher.betas, newHasher.betas) 22 | } 23 | 24 | func TestWMHHash(t *testing.T) { 25 | hasher := NewWeightedMinHasher(100, 50, 7) 26 | assert.NotNil(t, hasher) 27 | hasher.Bitness = 32 28 | hash := hasher.Hash([]float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 29 | []int{0, 10, 20, 30, 40, 50, 60, 70, 80, 90}) 30 | /* 31 | import numpy, datasketch 32 | gen = datasketch.WeightedMinHashGenerator(100, 50, 7) 33 | with open("test_data/wmh.bin", "rb") as fin: 34 | fin.read(9) 35 | gen.rs = numpy.frombuffer(fin.read(100*50*4), dtype=numpy.float32).reshape(50, 100) 36 | gen.ln_cs = numpy.frombuffer(fin.read(100*50*4), dtype=numpy.float32).reshape(50, 100) 37 | betas = numpy.frombuffer(fin.read(100*50*2), dtype=numpy.uint16) 38 | gen.betas = (betas / ((1 << 16) - 1)).astype(numpy.float32).reshape(50, 100) 39 | v = numpy.zeros(100, numpy.float32) 40 | for i, ii in enumerate([0, 10, 20, 30, 40, 50, 60, 70, 80, 90]): 41 | v[ii] = i + 1 42 | mh = gen.minhash(v) 43 | for h in mh.hashvalues: 44 | print("%d," % (h[0] | (h[1] << 16))) 45 | */ 46 | truth := []uint64{ 47 | 65586, 48 | 0, 49 | 65626, 50 | 65616, 51 | 65626, 52 | 30, 53 | 65616, 54 | 90, 55 | 40, 56 | 65576, 57 | 65596, 58 | 65586, 59 | 65626, 60 | 65626, 61 | 589884, 62 | 20, 63 | 65616, 64 | 65626, 65 | 65596, 66 | 65626, 67 | 262234, 68 | 131152, 69 | 65596, 70 | 65596, 71 | 65556, 72 | 65626, 73 | 65576, 74 | 65606, 75 | 65626, 76 | 65606, 77 | 10, 78 | 90, 79 | 65596, 80 | 65586, 81 | 65626, 82 | 
65606, 83 | 65626, 84 | 0, 85 | 131162, 86 | 65626, 87 | 65576, 88 | 65626, 89 | 65616, 90 | 65606, 91 | 65606, 92 | 131152, 93 | 65566, 94 | 65626, 95 | 65586, 96 | 65626, 97 | } 98 | assert.Equal(t, truth, hash) 99 | } 100 | 101 | func TestWMHTrash(t *testing.T) { 102 | hasher := NewWeightedMinHasher(100, 50, 7) 103 | assert.Panics(t, func() { 104 | hasher.Hash([]float32{1, 2, 3, 4, 5, 6, 7, 8, 9}, 105 | []int{0, 10, 20, 30, 40, 50, 60, 70, 80, 90}) 106 | }) 107 | assert.Panics(t, func() { 108 | hasher.Hash([]float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 109 | []int{0, 10, 20, 30, 40, 50, 60, 70, 80}) 110 | }) 111 | assert.Panics(t, func() { 112 | hasher.Hash([]float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 113 | []int{0, 10, 20, 30, 40, 50, 60, 70, 80, 100}) 114 | }) 115 | } 116 | -------------------------------------------------------------------------------- /licensedb/licensedb.go: -------------------------------------------------------------------------------- 1 | package licensedb 2 | 3 | import ( 4 | "errors" 5 | paths "path" 6 | 7 | "github.com/go-enry/go-license-detector/v4/licensedb/api" 8 | "github.com/go-enry/go-license-detector/v4/licensedb/filer" 9 | "github.com/go-enry/go-license-detector/v4/licensedb/internal" 10 | ) 11 | 12 | var ( 13 | // ErrNoLicenseFound is raised if no license files were found. 14 | ErrNoLicenseFound = errors.New("no license file was found") 15 | // ErrUnknownLicenseID is raised if license identifier is not known. 16 | // Probably you need to upgrade version of the SPDX. 17 | ErrUnknownLicenseID = errors.New("license id is not known") 18 | ) 19 | 20 | // Detect returns the most probable reference licenses matched for the given 21 | // file tree. Each match has the confidence assigned, from 0 to 1, 1 means 100% confident. 
22 | func Detect(fs filer.Filer) (map[string]api.Match, error) { 23 | files, err := fs.ReadDir("") 24 | if err != nil { 25 | return nil, err 26 | } 27 | fileNames := []string{} 28 | for _, file := range files { 29 | if !file.IsDir { 30 | fileNames = append(fileNames, file.Name) 31 | } else if internal.IsLicenseDirectory(file.Name) { 32 | // "license" directory, let's look inside 33 | subfiles, err := fs.ReadDir(file.Name) 34 | if err == nil { 35 | for _, subfile := range subfiles { 36 | if !subfile.IsDir { 37 | fileNames = append(fileNames, paths.Join(file.Name, subfile.Name)) 38 | } 39 | } 40 | } 41 | } 42 | } 43 | candidates := internal.ExtractLicenseFiles(fileNames, fs) 44 | licenses := internal.InvestigateLicenseTexts(candidates) 45 | if len(licenses) > 0 { 46 | return licenses, nil 47 | } 48 | // Plan B: take the README, find the section about the license and apply NER 49 | candidates = internal.ExtractReadmeFiles(fileNames, fs) 50 | if len(candidates) == 0 { 51 | return nil, ErrNoLicenseFound 52 | } 53 | licenses = internal.InvestigateReadmeTexts(candidates, fs) 54 | if len(licenses) == 0 { 55 | return nil, ErrNoLicenseFound 56 | } 57 | return licenses, nil 58 | } 59 | 60 | // Preload database with licenses - load internal database from assets into memory. 61 | // This method is an optimization for cases when the `Detect` method should return fast, 62 | // e.g. in HTTP web servers where connection timeout can occur during detect 63 | // `Preload` method could be called before server startup. 64 | // This method os optional and it's not required to be called, other APIs loads license database 65 | // lazily on first invocation. 
66 | func Preload() { 67 | internal.Preload() 68 | } 69 | 70 | // LicenseURLs returns the list of the URLs for the given license identifier 71 | func LicenseURLs(id string) ([]string, error) { 72 | urls, err := internal.LookupURLs(id) 73 | if err != nil { 74 | if errors.Is(err, internal.ErrUnknownLicenseID) { 75 | return nil, ErrUnknownLicenseID 76 | } 77 | return nil, err 78 | } 79 | return urls, nil 80 | } 81 | 82 | // LicenseName returns the name for the given license identifier 83 | func LicenseName(id string) (string, error) { 84 | name, err := internal.LookupName(id) 85 | if err != nil { 86 | if errors.Is(err, internal.ErrUnknownLicenseID) { 87 | return "", ErrUnknownLicenseID 88 | } 89 | return "", err 90 | } 91 | return name, nil 92 | } 93 | -------------------------------------------------------------------------------- /licensedb/licensedb_test.go: -------------------------------------------------------------------------------- 1 | package licensedb 2 | 3 | import ( 4 | "os" 5 | "path/filepath" 6 | "testing" 7 | 8 | "github.com/stretchr/testify/assert" 9 | "github.com/stretchr/testify/require" 10 | 11 | "github.com/go-enry/go-license-detector/v4/licensedb/filer" 12 | ) 13 | 14 | func BenchmarkDetect(b *testing.B) { 15 | f := pwdFiler() 16 | b.ResetTimer() 17 | for i := 0; i < b.N; i++ { 18 | _, err := Detect(f) 19 | if err != nil { 20 | panic(err) 21 | } 22 | } 23 | } 24 | 25 | func BenchmarkDetectWithPreload(b *testing.B) { 26 | f := pwdFiler() 27 | Preload() 28 | b.ResetTimer() 29 | for i := 0; i < b.N; i++ { 30 | _, err := Detect(f) 31 | if err != nil { 32 | panic(err) 33 | } 34 | } 35 | } 36 | 37 | func pwdFiler() filer.Filer { 38 | pwd, err := os.Getwd() 39 | if err != nil { 40 | panic(err) 41 | } 42 | root := filepath.Dir(pwd) 43 | f, err := filer.FromDirectory(root) 44 | if err != nil { 45 | panic(err) 46 | } 47 | return f 48 | } 49 | 50 | func TestLicenseURLs(t *testing.T) { 51 | t.Run("existing license", func(t *testing.T) { 52 | res, err := 
LicenseURLs("ODbL-1.0") 53 | require.NoError(t, err) 54 | assert.Equal(t, []string{"http://www.opendatacommons.org/licenses/odbl/1.0/", "https://opendatacommons.org/licenses/odbl/1-0/"}, res) 55 | }) 56 | 57 | t.Run("not existing license", func(t *testing.T) { 58 | _, err := LicenseURLs("bad-license-key") 59 | require.Equal(t, ErrUnknownLicenseID, err) 60 | }) 61 | } 62 | 63 | func TestLicenseName(t *testing.T) { 64 | t.Run("existing license", func(t *testing.T) { 65 | res, err := LicenseName("ODbL-1.0") 66 | require.NoError(t, err) 67 | assert.Equal(t, "Open Data Commons Open Database License v1.0", res) 68 | }) 69 | 70 | t.Run("not existing license", func(t *testing.T) { 71 | _, err := LicenseName("bad-license-key") 72 | require.Equal(t, ErrUnknownLicenseID, err) 73 | }) 74 | } 75 | -------------------------------------------------------------------------------- /licensedb/scan_file_content.go: -------------------------------------------------------------------------------- 1 | package licensedb 2 | 3 | import ( 4 | "github.com/go-enry/go-license-detector/v4/licensedb/internal" 5 | ) 6 | 7 | // InvestigateLicenseText takes the license text and returns the most probable reference licenses matched. 8 | // Each match has the confidence assigned, from 0 to 1, 1 means 100% confident. 9 | func InvestigateLicenseText(text []byte) map[string]float32 { 10 | return internal.InvestigateLicenseText(text) 11 | } 12 | --------------------------------------------------------------------------------