├── .editorconfig
├── .github
│   └── workflows
│       ├── CI_build.yml
│       └── pypi_publish.yml
├── .gitignore
├── .prospector.yml
├── .readthedocs.yml
├── .zenodo.json
├── CHANGELOG.md
├── CITATION.cff
├── CODE_OF_CONDUCT.rst
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.rst
├── conda
│   ├── README.md
│   ├── environment-build.yml
│   ├── environment-dev.yml
│   ├── environment.yml
│   └── meta.yaml
├── integration-tests
│   ├── test_user_workflow_spec2vec.model
│   └── test_user_workflow_spec2vec.py
├── readthedocs
│   ├── Makefile
│   ├── conf.py
│   ├── index.rst
│   └── make.bat
├── setup.cfg
├── setup.py
├── sonar-project.properties
├── spec2vec
│   ├── Document.py
│   ├── Spec2Vec.py
│   ├── SpectrumDocument.py
│   ├── __init__.py
│   ├── __version__.py
│   ├── logging_functions.py
│   ├── model_building.py
│   ├── serialization
│   │   ├── __init__.py
│   │   ├── model_exporting.py
│   │   └── model_importing.py
│   ├── utils.py
│   └── vector_operations.py
└── tests
    ├── __init__.py
    ├── conftest.py
    ├── data
    │   ├── model.json
    │   ├── pesticides.mgf
    │   └── weights.npy
    ├── test_document.py
    ├── test_logging.py
    ├── test_model_building.py
    ├── test_model_serialization.py
    ├── test_spec2vec.py
    ├── test_spectrum_document.py
    ├── test_vector_operations.py
    └── test_version_string_consistency.py

/.editorconfig:
--------------------------------------------------------------------------------
1 | # EditorConfig is awesome: http://EditorConfig.org
2 | 
3 | # top-most EditorConfig file
4 | root = true
5 | 
6 | # Unix-style newlines with a newline ending every file
7 | [*]
8 | end_of_line = lf
9 | insert_final_newline = true
10 | trim_trailing_whitespace = true
11 | charset = utf-8
12 | 
13 | # 4 space indentation
14 | [*.{py,java,r,R}]
15 | indent_style = space
16 | indent_size = 4
17 | 
18 | # 2 space indentation
19 | [*.{js,json,y{a,}ml,html,cwl}]
20 | indent_style = space
21 | indent_size = 2
22 | 
23 | [*.{md,Rmd,rst}]
24 | trim_trailing_whitespace = false
25 | indent_style = space
26 | indent_size = 2
27 | 
--------------------------------------------------------------------------------
/.github/workflows/CI_build.yml:
--------------------------------------------------------------------------------
1 | name: CI Build
2 | 
3 | on:
4 |   push:
5 |   pull_request:
6 |     types: [opened, reopened]
7 | 
8 | jobs:
9 | 
10 |   first_check:
11 |     name: first code check / python-3.8 / ubuntu-latest
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |       - uses: actions/checkout@v2
15 |       - name: Set up Python
16 |         uses: actions/setup-python@v1
17 |         with:
18 |           python-version: 3.8
19 |       - name: Python info
20 |         run: |
21 |           which python
22 |           python --version
23 |       - name: Build package and create dev environment
24 |         run: |
25 |           python -m pip install --upgrade pip
26 |           pip install -e .[dev]
27 |       - name: Show pip list
28 |         run: |
29 |           pip list
30 |       - name: Test with coverage
31 |         run: |
32 |           pytest --cov --cov-report term --cov-report xml --junitxml=xunit-result.xml
33 |       - name: Correct coverage paths
34 |         run: sed -i "s+$PWD/++g" coverage.xml
35 |       - name: Check style against standards using prospector
36 |         shell: bash -l {0}
37 |         run: prospector -o grouped -o pylint:pylint-report.txt
38 |       - name: Check whether import statements are used consistently
39 |         shell: bash -l {0}
40 |         run: isort --check-only --diff --conda-env spec2vec-dev .
41 | - name: SonarCloud Scan 42 | if: github.repository == 'iomega/spec2vec' 43 | uses: sonarsource/sonarcloud-github-action@master 44 | env: 45 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 46 | SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} 47 | 48 | build_pypi: 49 | name: Pypi and documentation build / python-${{ matrix.python-version }} / ${{ matrix.os }} 50 | runs-on: ${{ matrix.os }} 51 | needs: first_check 52 | strategy: 53 | fail-fast: false 54 | matrix: 55 | os: ['ubuntu-latest', 'macos-latest', 'windows-latest'] 56 | python-version: ['3.7', '3.8', '3.9'] 57 | exclude: 58 | # already tested in first_check job 59 | - python-version: 3.8 60 | os: ubuntu-latest 61 | steps: 62 | - uses: actions/checkout@v2 63 | - name: Set up Python ${{ matrix.python-version }} 64 | uses: actions/setup-python@v4 65 | with: 66 | python-version: ${{ matrix.python-version }} 67 | - name: Python info 68 | run: | 69 | which python 70 | python --version 71 | - name: Install dependencies 72 | run: | 73 | python -m pip install --upgrade pip 74 | - name: Build package 75 | run: | 76 | pip install wheel twine 77 | python setup.py sdist bdist_wheel 78 | - name: Test package 79 | run: | 80 | python -m twine check dist/* 81 | - name: Show pip list 82 | run: | 83 | pip list 84 | - name: Install development dependencies 85 | run: | 86 | pip install -e .[dev] 87 | - name: Test 88 | run: | 89 | pytest 90 | - name: Show environment variables 91 | shell: bash -l {0} 92 | run: | 93 | env | sort 94 | - name: Build documentation 95 | shell: bash -l {0} 96 | run: | 97 | make coverage doctest html 98 | working-directory: readthedocs/ 99 | env: 100 | SPHINXOPTS: "-n" # enable nit-picky mode 101 | - name: Check documentation coverage threshold 102 | if: matrix.os == 'ubuntu-latest' 103 | run: | 104 | cat readthedocs/_build/coverage/python.txt 105 | UNCOVERED_MEMBERS=$(grep '*' readthedocs/_build/coverage/python.txt | wc -l) 106 | UNCOVERED_MEMBERS_ALLOWED=5 107 | if (( $UNCOVERED_MEMBERS > $UNCOVERED_MEMBERS_ALLOWED )) ; then echo "There are currently ${UNCOVERED_MEMBERS} uncovered members in the documentation, which is more than the ${UNCOVERED_MEMBERS_ALLOWED} allowed." 
&& exit 1;fi
108 |           echo "The code is sufficiently documented with ${UNCOVERED_MEMBERS} uncovered members out of ${UNCOVERED_MEMBERS_ALLOWED} allowed.";
109 | 
110 |   anaconda_build:
111 |     name: Anaconda build / python-3.8 / ubuntu-latest
112 |     runs-on: ubuntu-latest
113 |     strategy:
114 |       fail-fast: false
115 |     needs: first_check
116 |     steps:
117 |       - uses: actions/checkout@v2
118 |         with:
119 |           fetch-depth: "0"
120 |       - name: Create spec2vec-build environment
121 |         uses: conda-incubator/setup-miniconda@v2
122 |         with:
123 |           activate-environment: spec2vec-build
124 |           auto-update-conda: true
125 |           environment-file: conda/environment-build.yml
126 |           python-version: 3.8
127 |       - name: Show conda config
128 |         shell: bash -l {0}
129 |         run: |
130 |           conda info
131 |           conda list
132 |           conda config --show-sources
133 |           conda config --show
134 |           conda env list
135 |       - name: Python info
136 |         shell: bash -l {0}
137 |         run: |
138 |           which python
139 |           python --version
140 |       - name: Show environment variables
141 |         shell: bash -l {0}
142 |         run: |
143 |           env | sort
144 |       - name: Build the conda package
145 |         shell: bash -l {0}
146 |         run: |
147 |           export BUILDDIR=$RUNNER_TEMP/spec2vec/_build
148 |           [ "$RUNNING_OS" = "Windows" ] && export BUILDDIR=$RUNNER_TEMP\\spec2vec\\_build\\
149 |           conda config --set anaconda_upload no
150 |           conda build --no-include-recipe \
151 |             --channel bioconda --channel conda-forge \
152 |             --croot ${BUILDDIR} \
153 |             ./conda
154 |       - name: Upload package artifact from build
155 |         uses: actions/upload-artifact@v2
156 |         with:
157 |           name: conda-package-artifact
158 |           path: ${{ runner.temp }}/spec2vec/_build
--------------------------------------------------------------------------------
/.github/workflows/pypi_publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish
2 | 
3 | on:
4 |   release:
5 |     types: [published]
6 | 
7 | jobs:
8 |   publish:
9 |     runs-on: ubuntu-latest
10 |     steps:
11 |       - uses: actions/checkout@v2
12 |       - name: Set up Python
13 |         uses: actions/setup-python@v1
14 |         with:
15 |           python-version: 3.7
16 |       - name: Install dependencies
17 |         run: |
18 |           python -m pip install --upgrade pip
19 |           pip install setuptools wheel twine
20 |           python setup.py sdist bdist_wheel
21 |       - name: Publish package
22 |         uses: pypa/gh-action-pypi-publish@release/v1
23 |         with:
24 |           user: __token__
25 |           password: ${{ secrets.PYPI_TOKEN }}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[cod]
2 | *.egg-info
3 | *.eggs
4 | .ipynb_checkpoints
5 | 
6 | build
7 | dist
8 | .cache
9 | __pycache__
10 | 
11 | htmlcov
12 | .coverage
13 | coverage.xml
14 | .pytest_cache
15 | pylint-report.txt
16 | xunit-result.xml
17 | .scannerwork/
18 | 
19 | docs/_build
20 | docs/apidocs
21 | 
22 | # ide
23 | .idea
24 | .eclipse
25 | .vscode
26 | 
27 | # Mac
28 | .DS_Store
29 | config.py
30 | output/
31 | /data/
32 | models_trained/
33 | computed_results/
34 | notebooks/.ipynb_checkpoints/
35 | __pycache__/
36 | 
37 | 
38 | # conda build directory
39 | /_build
--------------------------------------------------------------------------------
/.prospector.yml:
--------------------------------------------------------------------------------
1 | # prospector configuration file
2 | 
3 | ---
4 | 
5 | output-format: grouped
6 | 
7 | strictness: medium
8 | doc-warnings: false
9 | test-warnings: true
10 | member-warnings: false
11 | 
12 | ignore-paths:
13 |   - readthedocs
14 | 
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 | 
5 | # Required
6 | version: 2
7 | 
8 | # Build documentation in the docs/ directory with Sphinx
9 | sphinx:
10 |   builder: html
11 |   configuration: readthedocs/conf.py
12 | 
13 | python:
14 |   version: 3.7
15 |   install:
16 |     - method: pip
17 |       path: .
18 | conda:
19 |   environment: conda/environment-dev.yml
--------------------------------------------------------------------------------
/.zenodo.json:
--------------------------------------------------------------------------------
1 | {
2 |   "creators": [
3 |     {
4 |       "affiliation": "Netherlands eScience Center",
5 |       "name": "Huber, Florian",
6 |       "orcid": "0000-0002-3535-9406"
7 |     },
8 |     {
9 |       "affiliation": "Wageningen University and Research",
10 |       "name": "van der Hooft, Justin J. J.",
11 |       "orcid": "0000-0002-9340-5511"
12 |     },
13 |     {
14 |       "affiliation": "Netherlands eScience Center",
15 |       "name": "Spaaks, Jurriaan H.",
16 |       "orcid": "0000-0002-7064-4069"
17 |     },
18 |     {
19 |       "affiliation": "Netherlands eScience Center",
20 |       "name": "Diblen, Faruk",
21 |       "orcid": "0000-0002-0989-929X"
22 |     },
23 |     {
24 |       "affiliation": "Netherlands eScience Center",
25 |       "name": "Verhoeven, Stefan",
26 |       "orcid": "0000-0002-5821-2060"
27 |     },
28 |     {
29 |       "affiliation": "Netherlands eScience Center",
30 |       "name": "de Jonge, Niek",
31 |       "orcid": "0000-0002-3054-6210"
32 |     },
33 |     {
34 |       "affiliation": "Netherlands eScience Center",
35 |       "name": "Geng, Cunliang",
36 |       "orcid": "0000-0002-1409-8358"
37 |     },
38 |     {
39 |       "affiliation": "Netherlands eScience Center",
40 |       "name": "Meijer, Christiaan",
41 |       "orcid": "0000-0002-5529-5761"
42 |     },
43 |     {
44 |       "affiliation": "University of Glasgow",
45 |       "name": "Rogers, Simon",
46 |       "orcid": "0000-0003-3578-4477"
47 |     },
48 |     {
49 |       "affiliation": "Netherlands eScience Center",
50 |       "name": "Belloum, Adam",
51 |       "orcid": "0000-0001-6306-6937"
52 |     },
53 |     {
54 |       "affiliation": "Netherlands eScience Center",
55 |       "name": "Spreeuw, Hanno",
56 |       "orcid": "0000-0002-5057-0322"
57 |     },
58 |     {
59 |       "affiliation": "ICS, Masaryk University",
60 |       "name": "Skoryk, Maksym",
61 |       "orcid": "0000-0003-2056-8018"
62 |     }
63 |   ],
64 |   "description": "Word2Vec based similarity measure of mass spectrometry data.",
65 |   "keywords": [
66 |     "Word2Vec",
67 |     "similarity measures",
68 |     "mass spectrometry"
69 |   ],
70 |   "license": {
71 |     "id": "Apache-2.0"
72 |   },
73 |   "title": "spec2vec"
74 | }
75 | 
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 | 
3 | All notable changes to this project will be documented in this file.
4 | 
5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7 | 
8 | ## [Unreleased]
9 | 
10 | ## [0.8.1] - 2024-08-06
11 | ### Changed
12 | - Set max matchms to 0.26.4
13 | - Set max scipy to 1.10.1
14 | 
15 | ## [0.8.0] - 2023-01-06
16 | 
17 | ### Changed
18 | 
19 | - Minor changes to make tests pass with new matchms versions (>=0.18.0). These should nearly always be backwards compatible.
20 | - Now the dependency requirement is set to `matchms>=0.14.0`
21 | 
22 | ## [0.7.0] - 2022-10-01
23 | 
24 | ### Added
25 | 
26 | - added `spec2vec.serialization` subpackage to import and export `Word2Vec` models to/from disk without Pickle
27 |   (via `import_model` and `export_model` respectively) [#80](https://github.com/iomega/spec2vec/pull/80)
28 | 
29 | ### Changed
30 | 
31 | - bumped **gensim** version to `>=4.2.0` in dependencies [#84](https://github.com/iomega/spec2vec/pull/84)
32 | 
33 | ### Fixed
34 | 
35 | - updated code examples in the documentation to recent changes in matchms.
36 | 
37 | ## [0.6.0] - 2022-01-03
38 | 
39 | ### Added
40 | 
41 | - Logging (replacing former print statements) including options to write logs to file [#73](https://github.com/iomega/spec2vec/pull/73)
42 | - Now supports Python 3.9 (including CI test runs) [#40](https://github.com/iomega/spec2vec/issues/40)
43 | 
44 | ### Changed
45 | 
46 | - a missing-words percentage above the `allowed_missing_percentage` no longer causes an exception but only raises a warning [#73](https://github.com/iomega/spec2vec/pull/73)
47 | - changed the default setting for `allowed_missing_percentage` to 10.0 to be less strict on model coverage [#72](https://github.com/iomega/spec2vec/pull/72)
48 | 
49 | ### Fixed
50 | 
51 | - Can now also handle spectra in which no peak is known to the model (will return warning + empty vector) [#73](https://github.com/iomega/spec2vec/pull/73)
52 | 
53 | ## [0.5.0] - 2021-06-18
54 | 
55 | ### Changed
56 | 
57 | - Spec2Vec is now using gensim >= 4.0.0 [#62](https://github.com/iomega/spec2vec/pull/62)
58 | 
59 | ## [0.4.0] - 2021-02-10
60 | 
61 | ### Changed
62 | 
63 | - refactored `Spec2Vec` to now accept `Spectrum` or `SpectrumDocument` as input [#51](https://github.com/iomega/spec2vec/issues/51)
64 | 
65 | ### Fixed
66 | 
67 | - updated and fixed code examples [#51](https://github.com/iomega/spec2vec/issues/51)
68 | - updated and fixed attribute typing [#51](https://github.com/iomega/spec2vec/issues/51)
69 | 
70 | ## [0.3.4] - 2021-02-10
71 | 
72 | ### Changed
73 | 
74 | - update required numba version to >=0.51 to avoid issues between numba and numpy [#55](https://github.com/iomega/spec2vec/pull/55)
75 | 
76 | ## [0.3.3] - 2021-02-09
77 | 
78 | ### Added
79 | 
80 | - Metadata getter method for `SpectrumDocument` [#50](https://github.com/iomega/spec2vec/pull/50)
81 | - Implement `is_symmetric=True` option for `Spec2Vec.matrix` method [#53](https://github.com/iomega/spec2vec/pull/53)
82 | 
83 | ### Changed
84 | 
85 | - Change default for `n_decimals` parameter from 1 to 2 [#50](https://github.com/iomega/spec2vec/pull/50)
86 | 
87 | ## [0.3.2] - 2020-12-03
88 | 
89 | ### Changed
90 | 
91 | - Add optional progress bar for spec2vec.matrix() calculations (default is False) [#43](https://github.com/iomega/spec2vec/pull/43)
92 | 
93 | ## [0.3.1] - 2020-09-23
94 | 
95 | ### Changed
96 | 
97 | - Implement faster, numba-based cosine similarity function [#29](https://github.com/iomega/spec2vec/pull/29)
98 | 
99 | ## [0.3.0] - 2020-09-16
100 | 
101 | ### Added
102 | 
103 | - Support for Python 3.8 [#35](https://github.com/iomega/spec2vec/pull/35)
104 | 
105 | ### Changed
106 | 
107 | - Refactored Spec2Vec class to provide .pair() and .matrix() methods [#35](https://github.com/iomega/spec2vec/pull/35)
108 | 
109 | ### Removed
110 | 
111 | - Spec2VecParallel (is now included as Spec2Vec.matrix()) [#35](https://github.com/iomega/spec2vec/pull/35)
112 | 
113 | ## [0.2.0] - 2020-06-18
114 | 
115 | ### Added
116 | 
117 | - Wrapper for training a gensim word2vec
model [#13](https://github.com/iomega/spec2vec/tree/13-gensim-wrapper)
118 | - Basic logger for word2vec model training [#11](https://github.com/iomega/spec2vec/issues/11)
119 | 
120 | ### Changed
121 | 
122 | - Extend spec2vec similarity calculation to handle missing words [#9](https://github.com/iomega/spec2vec/issues/9)
123 | - Extend documentation and the given code examples [#15](https://github.com/iomega/spec2vec/issues/15)
124 | - Updated the integration test to work with matchms 0.4.0 [#7](https://github.com/iomega/spec2vec/issues/7)
125 | 
126 | ## [0.1.0] - 2020-06-02
127 | 
128 | ### Added
129 | 
130 | - Matchms as dependency [#4](https://github.com/iomega/spec2vec/pull/4)
131 | - Bump2version config
132 | 
133 | ### Changed
134 | 
135 | - Split spec2vec off from [matchms](https://github.com/matchms/matchms) [#1](https://github.com/iomega/spec2vec/pull/1) [#4](https://github.com/iomega/spec2vec/pull/4)
136 | - Updated packaging-related configuration
137 | - Updated the GH Actions workflows
138 | - Updated the documentation
139 | - Updated the badges
140 | - Updated the integration and unit tests
141 | - Updated the Zenodo metadata
142 | 
143 | ### Fixed
144 | 
145 | ### Removed
146 | 
147 | - Fossa configuration
148 | - Flowchart
149 | 
150 | [Unreleased]: https://github.com/iomega/spec2vec/compare/0.8.1...HEAD
151 | [0.8.1]: https://github.com/iomega/spec2vec/compare/0.8.0...0.8.1
152 | [0.8.0]: https://github.com/iomega/spec2vec/compare/0.7.0...0.8.0
153 | [0.7.0]: https://github.com/iomega/spec2vec/compare/0.6.0...0.7.0
154 | [0.6.0]: https://github.com/iomega/spec2vec/compare/0.5.0...0.6.0
155 | [0.5.0]: https://github.com/iomega/spec2vec/compare/0.4.0...0.5.0
156 | [0.4.0]: https://github.com/iomega/spec2vec/compare/0.3.4...0.4.0
157 | [0.3.4]: https://github.com/iomega/spec2vec/compare/0.3.3...0.3.4
158 | [0.3.3]: https://github.com/iomega/spec2vec/compare/0.3.2...0.3.3
159 | [0.3.2]: https://github.com/iomega/spec2vec/compare/0.3.1...0.3.2
160 | [0.3.1]: https://github.com/iomega/spec2vec/compare/0.3.0...0.3.1
161 | [0.3.0]: https://github.com/iomega/spec2vec/compare/0.2.0...0.3.0
162 | [0.2.0]: https://github.com/iomega/spec2vec/compare/0.1.0...0.2.0
163 | [0.1.0]: https://github.com/iomega/spec2vec/releases/tag/0.1.0
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | # YAML 1.2
2 | ---
3 | abstract: "Word2Vec based similarity measure of mass spectrometry data."
4 | authors:
5 |   -
6 |     affiliation: "Netherlands eScience Center"
7 |     family-names: Huber
8 |     given-names: Florian
9 |     orcid: "https://orcid.org/0000-0002-3535-9406"
10 |   -
11 |     affiliation: "Wageningen University and Research"
12 |     family-names: Hooft
13 |     name-particle: van der
14 |     given-names: Justin J. J.
15 |     orcid: "https://orcid.org/0000-0002-9340-5511"
16 |   -
17 |     affiliation: "Netherlands eScience Center"
18 |     family-names: Spaaks
19 |     given-names: Jurriaan H.
20 | orcid: "https://orcid.org/0000-0002-7064-4069" 21 | - 22 | affiliation: "Netherlands eScience Center" 23 | family-names: Diblen 24 | given-names: Faruk 25 | orcid: "https://orcid.org/0000-0002-0989-929X" 26 | - 27 | affiliation: "Netherlands eScience Center" 28 | family-names: Verhoeven 29 | given-names: Stefan 30 | orcid: "https://orcid.org/0000-0002-5821-2060" 31 | - 32 | affiliation: "Netherlands eScience Center" 33 | family-names: Geng 34 | given-names: Cunliang 35 | orcid: "https://orcid.org/0000-0002-1409-8358" 36 | - 37 | affiliation: "Netherlands eScience Center" 38 | family-names: Meijer 39 | given-names: Christiaan 40 | orcid: "https://orcid.org/0000-0002-5529-5761" 41 | - 42 | affiliation: "University of Glasgow" 43 | family-names: Rogers 44 | given-names: Simon 45 | orcid: "https://orcid.org/0000-0003-3578-4477" 46 | - 47 | affiliation: "Netherlands eScience Center" 48 | family-names: Belloum 49 | given-names: Adam 50 | orcid: "https://orcid.org/0000-0001-6306-6937" 51 | - 52 | affiliation: "Netherlands eScience Center" 53 | family-names: Spreeuw 54 | given-names: Hanno 55 | orcid: "https://orcid.org/0000-0002-5057-0322" 56 | - 57 | affiliation: "Netherlands eScience Center" 58 | family-names: de Jonge 59 | given-names: Niek 60 | orcid: "https://orcid.org/0000-0002-3054-6210" 61 | - 62 | affiliation: "ICS, Masaryk University" 63 | family-names: Skoryk 64 | given-names: Maksym 65 | orcid: "https://orcid.org/0000-0003-2056-8018" 66 | 67 | cff-version: "1.1.0" 68 | keywords: 69 | - Word2Vec 70 | - "similarity measures" 71 | - "mass spectrometry" 72 | license: "Apache-2.0" 73 | message: "If you use this software, please cite it using these metadata." 74 | repository-code: "https://github.com/iomega/spec2vec" 75 | title: spec2vec 76 | ... 77 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.rst: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | Contributor Covenant Code of Conduct 3 | ############################################################################### 4 | 5 | Our Pledge 6 | ********** 7 | 8 | In the interest of fostering an open and welcoming environment, we as 9 | contributors and maintainers pledge to making participation in our project and 10 | our community a harassment-free experience for everyone, regardless of age, body 11 | size, disability, ethnicity, gender identity and expression, level of experience, 12 | education, socio-economic status, nationality, personal appearance, race, 13 | religion, or sexual identity and orientation. 
14 | 
15 | Our Standards
16 | *************
17 | 
18 | Examples of behavior that contributes to creating a positive environment
19 | include:
20 | 
21 | * Using welcoming and inclusive language
22 | * Being respectful of differing viewpoints and experiences
23 | * Gracefully accepting constructive criticism
24 | * Focusing on what is best for the community
25 | * Showing empathy towards other community members
26 | 
27 | Examples of unacceptable behavior by participants include:
28 | 
29 | * The use of sexualized language or imagery and unwelcome sexual attention or
30 |   advances
31 | * Trolling, insulting/derogatory comments, and personal or political attacks
32 | * Public or private harassment
33 | * Publishing others' private information, such as a physical or electronic
34 |   address, without explicit permission
35 | * Other conduct which could reasonably be considered inappropriate in a
36 |   professional setting
37 | 
38 | Our Responsibilities
39 | ********************
40 | 
41 | Project maintainers are responsible for clarifying the standards of acceptable
42 | behavior and are expected to take appropriate and fair corrective action in
43 | response to any instances of unacceptable behavior.
44 | 
45 | Project maintainers have the right and responsibility to remove, edit, or
46 | reject comments, commits, code, wiki edits, issues, and other contributions
47 | that are not aligned to this Code of Conduct, or to ban temporarily or
48 | permanently any contributor for other behaviors that they deem inappropriate,
49 | threatening, offensive, or harmful.
50 | 
51 | Scope
52 | *****
53 | 
54 | This Code of Conduct applies both within project spaces and in public spaces
55 | when an individual is representing the project or its community. Examples of
56 | representing a project or community include using an official project e-mail
57 | address, posting via an official social media account, or acting as an appointed
58 | representative at an online or offline event. Representation of a project may be
59 | further defined and clarified by project maintainers.
60 | 
61 | Enforcement
62 | ***********
63 | 
64 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
65 | reported by contacting the project team at generalization@esciencecenter.nl. All
66 | complaints will be reviewed and investigated and will result in a response that
67 | is deemed necessary and appropriate to the circumstances. The project team is
68 | obligated to maintain confidentiality with regard to the reporter of an incident.
69 | Further details of specific enforcement policies may be posted separately.
70 | 
71 | Project maintainers who do not follow or enforce the Code of Conduct in good
72 | faith may face temporary or permanent repercussions as determined by other
73 | members of the project's leadership.
74 | 
75 | Attribution
76 | ***********
77 | 
78 | This Code of Conduct is adapted from the `Contributor Covenant <https://www.contributor-covenant.org>`_, version 1.4,
79 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing guidelines
2 | 
3 | We welcome any kind of contribution to our software, from a simple comment or question to a full-fledged [pull request](https://help.github.com/articles/about-pull-requests/). Please read and follow our [Code of Conduct](CODE_OF_CONDUCT.rst).
4 | 
5 | A contribution can be one of the following cases:
6 | 
7 | 1. you have a question;
8 | 1. you think you may have found a bug (including unexpected behavior);
9 | 1. you want to make some kind of change to the code base (e.g. to fix a bug, to add a new feature, to update documentation);
10 | 1. you want to make a new release of the code base.
11 | 
12 | The sections below outline the steps in each case.
13 | 
14 | ## You have a question
15 | 
16 | 1. use the search functionality [here](https://github.com/iomega/spec2vec/issues) to see if someone already filed the same issue;
17 | 1. if your issue search did not yield any relevant results, make a new issue;
18 | 1. apply the "Question" label; apply other labels when relevant.
19 | 
20 | ## You think you may have found a bug
21 | 
22 | 1. use the search functionality [here](https://github.com/iomega/spec2vec/issues) to see if someone already filed the same issue;
23 | 1. if your issue search did not yield any relevant results, make a new issue, making sure to provide enough information for the rest of the community to understand the cause and context of the problem. Depending on the issue, you may want to include:
24 |     - the [SHA hashcode](https://help.github.com/articles/autolinked-references-and-urls/#commit-shas) of the commit that is causing your problem;
25 |     - some identifying information (name and version number) for dependencies you're using;
26 |     - information about the operating system;
27 | 1. apply relevant labels to the newly created issue.
28 | 
29 | ## You want to make some kind of change to the code base
30 | 
31 | 1. (**important**) announce your plan to the rest of the community *before you start working*. This announcement should be in the form of a (new) issue;
32 | 1. (**important**) wait until some kind of consensus is reached about your idea being a good idea;
33 | 1. if needed, fork the repository to your own Github profile and create your own feature branch off of the latest master commit. While working on your feature branch, make sure to stay up to date with the master branch by pulling in changes, possibly from the 'upstream' repository (follow the instructions [here](https://help.github.com/articles/configuring-a-remote-for-a-fork/) and [here](https://help.github.com/articles/syncing-a-fork/));
34 | 1. make sure the existing tests still work by running ``python setup.py test``;
35 | 1. add your own tests (if necessary);
36 | 1. update or expand the documentation;
37 | 1. update the `CHANGELOG.md` file with your changes;
38 | 1. [push](http://rogerdudler.github.io/git-guide/) your feature branch to (your fork of) the spec2vec repository on GitHub;
39 | 1. create the pull request, e.g. following the instructions [here](https://help.github.com/articles/creating-a-pull-request/).
40 | 
41 | In case you feel like you've made a valuable contribution, but you don't know how to write or run tests for it, or how to generate the documentation: don't let this discourage you from making the pull request; we can help you! Just go ahead and submit the pull request, but keep in mind that you might be asked to append additional commits to your pull request.
42 | 
43 | ## You want to make a new release of the code base
44 | 
45 | To create a release you need write permission on the repository.
46 | 
47 | 1. Check the author list in the `CITATION.cff` and `.zenodo.json` files
48 | 1. Bump the version using `bump2version <major|minor|patch>`. For example, `bump2version major` will increase the major version number everywhere it's needed (code, meta, etc.) in the repo.
49 | 1. Update the `CHANGELOG.md` to include the changes made
50 | 1. Go to the [GitHub release page](https://github.com/iomega/spec2vec/releases)
51 | 1. Press the "Draft a new release" button
52 | 1. Fill in the version, title, and description fields
53 | 1. Press the "Publish release" button
54 | 
55 | A GitHub Action will then run and publish the new version to [anaconda](https://anaconda.org/nlesc/spec2vec).
56 | A Zenodo entry will also be made for the release with its own DOI.
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | 
2 |                                  Apache License
3 |                            Version 2.0, January 2004
4 |                         http://www.apache.org/licenses/
5 | 
6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 | 
8 |    1. Definitions.
9 | 
10 |       "License" shall mean the terms and conditions for use, reproduction,
11 |       and distribution as defined by Sections 1 through 9 of this document.
12 | 
13 |       "Licensor" shall mean the copyright owner or entity authorized by
14 |       the copyright owner that is granting the License.
15 | 
16 |       "Legal Entity" shall mean the union of the acting entity and all
17 |       other entities that control, are controlled by, or are under common
18 |       control with that entity. For the purposes of this definition,
19 |       "control" means (i) the power, direct or indirect, to cause the
20 |       direction or management of such entity, whether by contract or
21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 |       outstanding shares, or (iii) beneficial ownership of such entity.
23 | 
24 |       "You" (or "Your") shall mean an individual or Legal Entity
25 |       exercising permissions granted by this License.
26 | 
27 |       "Source" form shall mean the preferred form for making modifications,
28 |       including but not limited to software source code, documentation
29 |       source, and configuration files.
30 | 
31 |       "Object" form shall mean any form resulting from mechanical
32 |       transformation or translation of a Source form, including but
33 |       not limited to compiled object code, generated documentation,
34 |       and conversions to other media types.
35 | 
36 |       "Work" shall mean the work of authorship, whether in Source or
37 |       Object form, made available under the License, as indicated by a
38 |       copyright notice that is included in or attached to the work
39 |       (an example is provided in the Appendix below).
40 | 
41 |       "Derivative Works" shall mean any work, whether in Source or Object
42 |       form, that is based on (or derived from) the Work and for which the
43 |       editorial revisions, annotations, elaborations, or other modifications
44 |       represent, as a whole, an original work of authorship. For the purposes
45 |       of this License, Derivative Works shall not include works that remain
46 |       separable from, or merely link (or bind by name) to the interfaces of,
47 |       the Work and Derivative Works thereof.
48 | 
49 |       "Contribution" shall mean any work of authorship, including
50 |       the original version of the Work and any modifications or additions
51 |       to that Work or Derivative Works thereof, that is intentionally
52 |       submitted to Licensor for inclusion in the Work by the copyright owner
53 |       or by an individual or Legal Entity authorized to submit on behalf of
54 |       the copyright owner.
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "{}"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!) The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright [yyyy] [name of copyright owner]
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 | 204 | 205 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.rst 3 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | `fair-software.nl `_ recommendations: 2 | 3 | |GitHub Badge| 4 | |License Badge| 5 | |Conda Badge| |Pypi Badge| |Research Software Directory Badge| 6 | |Zenodo Badge| 7 | |CII Best Practices Badge| |Howfairis Badge| 8 | 9 | Code quality checks: 10 | 11 | |GitHub Workflow Status| 12 | |ReadTheDocs Badge| 13 | |Sonarcloud Quality Gate Badge| |Sonarcloud Coverage Badge| 14 | 15 | ################################################################################ 16 | spec2vec 17 | ################################################################################ 18 | **Spec2vec** is a novel spectral similarity score inspired by a natural language processing 19 | algorithm -- Word2Vec. Where Word2Vec learns relationships between words in sentences, 20 | **spec2vec** does so for mass fragments and neutral losses in MS/MS spectra. 21 | The spectral similarity score is based on spectral embeddings learnt 22 | from the fragmental relationships within a large set of spectral data. 23 | 24 | If you use **spec2vec** for your research, please cite the following references: 25 | 26 | Huber F, Ridder L, Verhoeven S, Spaaks JH, Diblen F, Rogers S, van der Hooft JJJ, (2021) "Spec2Vec: Improved mass spectral similarity scoring through learning of structural relationships". PLoS Comput Biol 17(2): e1008724. `doi:10.1371/journal.pcbi.1008724 `_ 27 | 28 | (and if you use **matchms** as well: 29 | F. Huber, S. Verhoeven, C. Meijer, H. Spreeuw, E. M. Villanueva Castilla, C. Geng, J.J.J. van der Hooft, S. Rogers, A. Belloum, F. Diblen, J.H. Spaaks, (2020). "matchms - processing and similarity evaluation of mass spectrometry data". Journal of Open Source Software, 5(52), 2411, https://doi.org/10.21105/joss.02411 ) 30 | 31 | Thanks! 32 | 33 | 34 | 35 | .. |GitHub Badge| image:: https://img.shields.io/badge/github-repo-000.svg?logo=github&labelColor=gray&color=blue 36 | :target: https://github.com/iomega/spec2vec 37 | :alt: GitHub Badge 38 | 39 | .. |License Badge| image:: https://img.shields.io/github/license/iomega/spec2vec 40 | :target: https://github.com/iomega/spec2vec 41 | :alt: License Badge 42 | 43 | .. |Conda Badge| image:: https://img.shields.io/conda/v/bioconda/spec2vec?color=blue 44 | :target: https://bioconda.github.io/recipes/spec2vec/README.html 45 | :alt: Conda Badge (Bioconda) 46 | 47 | .. |Pypi Badge| image:: https://img.shields.io/pypi/v/spec2vec?color=blue 48 | :target: https://pypi.org/project/spec2vec/ 49 | :alt: spec2vec on PyPI 50 | 51 | .. |Research Software Directory Badge| image:: https://img.shields.io/badge/rsd-spec2vec-00a3e3.svg 52 | :target: https://www.research-software.nl/software/spec2vec 53 | :alt: Research Software Directory Badge 54 | 55 | .. |Zenodo Badge| image:: https://zenodo.org/badge/DOI/10.5281/zenodo.3873169.svg 56 | :target: https://doi.org/10.5281/zenodo.3873169 57 | :alt: Zenodo Badge 58 | 59 | .. |CII Best Practices Badge| image:: https://bestpractices.coreinfrastructure.org/projects/3967/badge 60 | :target: https://bestpractices.coreinfrastructure.org/projects/3967 61 | :alt: CII Best Practices Badge 62 | 63 | .. 
|Howfairis Badge| image:: https://img.shields.io/badge/fair--software.eu-%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F-green
64 |    :target: https://fair-software.eu
65 |    :alt: Howfairis Badge
66 | 
67 | .. |ReadTheDocs Badge| image:: https://readthedocs.org/projects/spec2vec/badge/?version=latest
68 |     :alt: Documentation Status
69 |     :scale: 100%
70 |     :target: https://spec2vec.readthedocs.io/en/latest/?badge=latest
71 | 
72 | .. |Sonarcloud Quality Gate Badge| image:: https://sonarcloud.io/api/project_badges/measure?project=iomega_spec2vec&metric=alert_status
73 |    :target: https://sonarcloud.io/dashboard?id=iomega_spec2vec
74 |    :alt: Sonarcloud Quality Gate
75 | 
76 | .. |Sonarcloud Coverage Badge| image:: https://sonarcloud.io/api/project_badges/measure?project=iomega_spec2vec&metric=coverage
77 |    :target: https://sonarcloud.io/component_measures?id=iomega_spec2vec&metric=Coverage&view=list
78 |    :alt: Sonarcloud Coverage
79 | 
80 | .. |GitHub Workflow Status| image:: https://img.shields.io/github/actions/workflow/status/matchms/spec2vec/CI_build.yml?branch=master
81 |    :target: https://img.shields.io/github/workflow/status/iomega/spec2vec/CI%20Build
82 |    :alt: GitHub Workflow Status
83 | 
84 | 
85 | ***********************
86 | Documentation for users
87 | ***********************
88 | For more extensive documentation `see our readthedocs <https://spec2vec.readthedocs.io/en/latest/>`_ or get started with our `spec2vec introduction tutorial `_.
89 | 
90 | Versions
91 | ========
92 | Since version `0.5.0`, Spec2Vec uses `gensim >= 4.0.0`, which should make it faster and more future-proof. Models trained with older versions should still be importable without any issues. If you had scripts that used additional gensim code, however, those might occasionally need some adaptation; see also the `gensim documentation on how to migrate your code `_.
93 | 
94 | 
95 | Installation
96 | ============
97 | 
98 | 
99 | Prerequisites:
100 | 
101 | - Python 3.7, 3.8, or 3.9
102 | - Recommended: Anaconda
103 | 
104 | We recommend installing spec2vec from Anaconda Cloud with
105 | 
106 | .. code-block:: console
107 | 
108 |   conda create --name spec2vec python=3.8
109 |   conda activate spec2vec
110 |   conda install --channel bioconda --channel conda-forge spec2vec
111 | 
112 | Alternatively, spec2vec can also be installed using ``pip``. When using spec2vec together with ``matchms``, note that only the Anaconda install will make sure that ``rdkit`` is also installed properly, which is required for a few matchms filter functions (it is not required for any spec2vec-related functionality).
113 | 
114 | .. code-block:: console
115 | 
116 |   pip install spec2vec
117 | 
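To quickly verify that the installation succeeded, you can import the package and print its version string (a simple sanity check; the printed version will of course depend on the release you installed):

.. code-block:: python

    import spec2vec

    # Should print the installed release, e.g. "0.8.1"
    print(spec2vec.__version__)
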
118 | Examples
119 | ========
120 | Below is a code example of how to process a large dataset of reference spectra to
121 | train a word2vec model from scratch. Spectra are converted to documents using ``SpectrumDocument``, which converts spectrum peaks into "words" according to their m/z ratio (for instance "peak@100.39"). A new word2vec model can then be trained using ``train_new_word2vec_model``, which will set the training parameters to spec2vec defaults unless specified otherwise. Word2Vec models learn from co-occurrences of peaks ("words") across many different spectra.
122 | To get a model that can give a meaningful representation of a set of
123 | given spectra it is desirable to train the model on a large and representative
124 | dataset.
125 | 
126 | .. code-block:: python
127 | 
128 |     import os
129 |     import matchms.filtering as msfilters
130 |     from matchms.importing import load_from_mgf
131 |     from spec2vec import SpectrumDocument
132 |     from spec2vec.model_building import train_new_word2vec_model
133 | 
134 |     def spectrum_processing(s):
135 |         """This is how one would typically design a desired pre- and post-
136 |         processing pipeline."""
137 |         s = msfilters.default_filters(s)
138 |         s = msfilters.add_parent_mass(s)
139 |         s = msfilters.normalize_intensities(s)
140 |         s = msfilters.reduce_to_number_of_peaks(s, n_required=10, ratio_desired=0.5, n_max=500)
141 |         s = msfilters.select_by_mz(s, mz_from=0, mz_to=1000)
142 |         s = msfilters.add_losses(s, loss_mz_from=10.0, loss_mz_to=200.0)
143 |         s = msfilters.require_minimum_number_of_peaks(s, n_required=10)
144 |         return s
145 | 
146 |     # Load data from MGF file and apply filters
147 |     spectrums = [spectrum_processing(s) for s in load_from_mgf("reference_spectrums.mgf")]
148 | 
149 |     # Omit spectrums that didn't qualify for analysis
150 |     spectrums = [s for s in spectrums if s is not None]
151 | 
152 |     # Create spectrum documents
153 |     reference_documents = [SpectrumDocument(s, n_decimals=2) for s in spectrums]
154 | 
155 |     model_file = "references.model"
156 |     model = train_new_word2vec_model(reference_documents, iterations=[10, 20, 30], filename=model_file,
157 |                                      workers=2, progress_logger=True)
158 | 
159 | Once a word2vec model has been trained, spec2vec can calculate the similarities
160 | between mass spectra based on this model. In cases where the word2vec model was
161 | trained on data other than the data it is applied to, a number of peaks ("words")
162 | might be unknown to the model (if they weren't part of the training dataset). To
163 | account for those cases it is important to specify the ``allowed_missing_percentage``,
164 | as in the example below.
165 | 
166 | .. code-block:: python
167 | 
168 |     import gensim
169 |     from matchms import calculate_scores
170 |     from spec2vec import Spec2Vec
171 | 
172 |     # query_spectrums loaded from files using https://matchms.readthedocs.io/en/latest/api/matchms.importing.load_from_mgf.html
173 |     query_spectrums = [spectrum_processing(s) for s in load_from_mgf("query_spectrums.mgf")]
174 | 
175 |     # Omit spectrums that didn't qualify for analysis
176 |     query_spectrums = [s for s in query_spectrums if s is not None]
177 | 
178 |     # Import pre-trained word2vec model (see code example above)
179 |     model_file = "references.model"
180 |     model = gensim.models.Word2Vec.load(model_file)
181 | 
182 |     # Define similarity_function
183 |     spec2vec_similarity = Spec2Vec(model=model, intensity_weighting_power=0.5,
184 |                                    allowed_missing_percentage=5.0)
185 | 
186 |     # Calculate scores on all combinations of reference spectrums and queries
187 |     scores = calculate_scores(reference_documents, query_spectrums, spec2vec_similarity)
188 | 
189 |     # Find the highest scores for a query spectrum of interest
190 |     best_matches = scores.scores_by_query(query_spectrums[0], sort=True)[:10]
191 | 
192 |     # Return highest scores
193 |     print([x[1] for x in best_matches])
194 | 
195 | 
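If you want to work with the underlying spectrum embeddings directly (for instance for clustering or other custom downstream analyses), they can also be computed one document at a time. The following is a minimal sketch, assuming the ``calc_vector`` helper from ``spec2vec.vector_operations`` and the ``model`` and ``reference_documents`` objects from the examples above; the weighting and missing-word parameters mirror those of ``Spec2Vec``:

.. code-block:: python

    from spec2vec.vector_operations import calc_vector

    # Compute the embedding of a single spectrum document.
    embedding = calc_vector(model, reference_documents[0],
                            intensity_weighting_power=0.5,
                            allowed_missing_percentage=5.0)

    # One vector per document, with length equal to the model dimensionality.
    print(embedding.shape)  # e.g. (model.wv.vector_size,)

``Spec2Vec`` computes such vectors internally when scoring pairs of spectra, so this route is mainly useful when you need the raw embeddings rather than similarity scores.
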
196 | Glossary of terms
197 | =================
198 | 
199 | .. list-table::
200 |    :header-rows: 1
201 | 
202 |    * - Term
203 |      - Description
204 |    * - adduct / addition product
205 |      - During ionization in a mass spectrometer, the molecules of the injected compound break apart
206 |        into fragments. When fragments combine into a new compound, this is known as an addition
207 |        product, or adduct. `Wikipedia <https://en.wikipedia.org/wiki/Adduct>`__
208 |    * - GNPS
209 |      - Knowledge base for sharing of mass spectrometry data (`link <https://gnps.ucsd.edu/>`__).
210 |    * - InChI / :code:`INCHI`
211 |      - InChI is short for International Chemical Identifier. InChIs are useful
212 |        in retrieving information associated with a certain molecule from a
213 |        database.
214 |    * - InChIKey / InChI key / :code:`INCHIKEY`
215 |      - An identifier for molecules. For example, the InChI key for carbon
216 |        dioxide is :code:`InChIKey=CURLTUGMZLYLDI-UHFFFAOYSA-N` (yes, it
217 |        includes the substring :code:`InChIKey=`).
218 |    * - MGF File / Mascot Generic Format
219 |      - A plain ASCII file format to store peak list data from a mass spectrometry experiment. Links: `matrixscience.com `__,
220 |        `fiehnlab.ucdavis.edu `__.
221 |    * - parent mass / :code:`parent_mass`
222 |      - Actual mass (in Dalton) of the original compound prior to fragmentation.
223 |        It can be recalculated from the precursor m/z by taking
224 |        into account the charge state and proton/electron masses.
225 |    * - precursor m/z / :code:`precursor_mz`
226 |      - Mass-to-charge ratio of the compound targeted for fragmentation.
227 |    * - SMILES
228 |      - A line notation for describing the structure of chemical species using
229 |        short ASCII strings. For example, water is encoded as :code:`O`,
230 |        carbon dioxide is encoded as :code:`O=C=O`, etc. SMILES-encoded species may be converted to InChIKey `using a resolver like this one `__. The Wikipedia entry for SMILES is `here <https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system>`__.
231 | 
232 | 
233 | ****************************
234 | Documentation for developers
235 | ****************************
236 | 
237 | Installation
238 | ============
239 | 
240 | To install spec2vec, do:
241 | 
242 | .. code-block:: console
243 | 
244 |   git clone https://github.com/iomega/spec2vec.git
245 |   cd spec2vec
246 |   conda env create --file conda/environment-dev.yml
247 |   conda activate spec2vec-dev
248 |   pip install --editable .
249 | 
250 | Run the linter with:
251 | 
252 | .. code-block:: console
253 | 
254 |   prospector
255 | 
256 | Run tests (including coverage) with:
257 | 
258 | .. code-block:: console
259 | 
260 |   pytest
261 | 
262 | 
263 | Conda package
264 | =============
265 | 
266 | The conda packaging is handled by a `recipe at Bioconda `_.
267 | 
268 | Publishing to PyPI will trigger the creation of a `pull request on the bioconda recipes repository `_.
269 | Once the PR is merged, the new version of spec2vec will appear on `https://anaconda.org/bioconda/spec2vec <https://anaconda.org/bioconda/spec2vec>`_.
270 | 
271 | 
272 | To remove the spec2vec package from the active environment:
273 | 
274 | .. code-block:: console
275 | 
276 |   conda remove spec2vec
277 | 
278 | 
279 | To remove the spec2vec environment:
280 | 
281 | .. code-block:: console
282 | 
283 |   conda env remove --name spec2vec
284 | 
285 | Contributing
286 | ============
287 | 
288 | If you want to contribute to the development of spec2vec,
289 | have a look at the `contribution guidelines <CONTRIBUTING.md>`_.
290 | 
291 | *******
292 | License
293 | *******
294 | 
295 | Copyright (c) 2023, Netherlands eScience Center & Düsseldorf University of Applied Sciences
296 | 
297 | Licensed under the Apache License, Version 2.0 (the "License");
298 | you may not use this file except in compliance with the License.
299 | You may obtain a copy of the License at
300 | 
301 |   http://www.apache.org/licenses/LICENSE-2.0
302 | 
303 | Unless required by applicable law or agreed to in writing, software
304 | distributed under the License is distributed on an "AS IS" BASIS,
305 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
306 | See the License for the specific language governing permissions and 307 | limitations under the License. 308 | 309 | ******* 310 | Credits 311 | ******* 312 | 313 | This package was created with `Cookiecutter 314 | `_ and the `NLeSC/python-template 315 | `_. 316 | -------------------------------------------------------------------------------- /conda/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iomega/spec2vec/36553f0e1df589dc02fcb6945fe440ccc2769c69/conda/README.md -------------------------------------------------------------------------------- /conda/environment-build.yml: -------------------------------------------------------------------------------- 1 | name: spec2vec-build 2 | channels: 3 | - defaults 4 | dependencies: 5 | - anaconda-client 6 | - conda-build 7 | - conda-verify 8 | - python >=3.7 9 | -------------------------------------------------------------------------------- /conda/environment-dev.yml: -------------------------------------------------------------------------------- 1 | name: spec2vec-dev 2 | channels: 3 | - bioconda 4 | - conda-forge 5 | - defaults 6 | - nlesc 7 | dependencies: 8 | - gensim >=4.2.0 9 | - matchms >=0.6.2 10 | - numba >=0.51 11 | - numpy 12 | - pip 13 | - python >=3.7 14 | - scipy 15 | - tqdm 16 | - pip: 17 | - -e ..[dev] 18 | -------------------------------------------------------------------------------- /conda/environment.yml: -------------------------------------------------------------------------------- 1 | name: spec2vec 2 | channels: 3 | - bioconda 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - gensim >=4.2.0 8 | - matchms >=0.6.2 9 | - numba >=0.51 10 | - numpy 11 | - python >=3.7 12 | - scipy 13 | - tqdm 14 | -------------------------------------------------------------------------------- /conda/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "spec2vec" %} 2 | {% set version = "0.8.1" %} 3 | 4 | package: 5 | name: {{ name|lower }} 6 | version: {{ version }} 7 | 8 | source: 9 | path: .. 10 | 11 | extra: 12 | channels: 13 | - nlesc 14 | - conda-forge 15 | - bioconda 16 | 17 | build: 18 | noarch: python 19 | preserve_egg_dir: True 20 | number: 0 21 | skip: True # [py2k] 22 | script: {{ PYTHON }} -m pip install --no-deps --ignore-installed . -vv 23 | 24 | requirements: 25 | build: 26 | - conda-build 27 | - conda-verify 28 | - pytest-runner 29 | - python 30 | - matchms >=0.6.2 31 | - numpy {{ numpy }} 32 | - setuptools 33 | host: 34 | - python >=3.7 35 | - pip 36 | - pytest-runner 37 | - setuptools 38 | run: 39 | - gensim >=4.2.0 40 | - matchms >=0.14.0, <=0.26.4 41 | - numba >=0.51 42 | - numpy 43 | - pip 44 | - python >=3.7 45 | - scipy <=1.10.1 46 | - tqdm 47 | 48 | test: 49 | imports: 50 | - spec2vec 51 | 52 | about: 53 | home: https://github.com/iomega/spec2vec 54 | license: Apache-2.0 55 | license_family: APACHE 56 | license_file: LICENSE 57 | summary: Word2Vec based similarity measure of mass spectrometry data. 58 | description: Word2Vec based similarity measure of mass spectrometry data. 
59 | doc_url: https://spec2vec.readthedocs.io/
60 | dev_url: https://github.com/iomega/spec2vec
61 |
62 | extra:
63 |   recipe-maintainers:
64 |     - fdiblen
65 |     - florian-huber
66 |
--------------------------------------------------------------------------------
/integration-tests/test_user_workflow_spec2vec.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iomega/spec2vec/36553f0e1df589dc02fcb6945fe440ccc2769c69/integration-tests/test_user_workflow_spec2vec.model
--------------------------------------------------------------------------------
/integration-tests/test_user_workflow_spec2vec.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import gensim
 3 | import numpy as np
 4 | from matchms import calculate_scores
 5 | from matchms.filtering import (add_losses, add_parent_mass, default_filters,
 6 |                                normalize_intensities,
 7 |                                reduce_to_number_of_peaks,
 8 |                                require_minimum_number_of_peaks, select_by_mz)
 9 | from matchms.importing import load_from_mgf
10 | from spec2vec import Spec2Vec, SpectrumDocument
11 | 
12 | 
13 | def test_user_workflow_spec2vec():
14 |     """Test typical user workflow to get from mass spectra to spec2vec similarities.
15 | 
16 |     This test will run a typical workflow example using a small dataset and a
17 |     pretrained word2vec model. One main aspect of this is to test if users will
18 |     get exactly the same spec2vec similarity scores when starting from a word2vec
19 |     model that was trained and saved elsewhere.
20 |     """
21 |     def apply_my_filters(s):
22 |         """This is how a user would typically design their own pre- and post-
23 |         processing pipeline."""
24 |         s = default_filters(s)
25 |         s = add_parent_mass(s)
26 |         s = normalize_intensities(s)
27 |         s = reduce_to_number_of_peaks(s, n_required=10, ratio_desired=0.5)
28 |         s = select_by_mz(s, mz_from=0, mz_to=1000)
29 |         s = add_losses(s, loss_mz_from=10.0, loss_mz_to=200.0)
30 |         s = require_minimum_number_of_peaks(s, n_required=5)
31 |         return s
32 | 
33 |     repository_root = os.path.join(os.path.dirname(__file__), "..")
34 |     spectrums_file = os.path.join(repository_root, "tests", "data", "pesticides.mgf")
35 | 
36 |     # apply my filters to the data
37 |     spectrums = [apply_my_filters(s) for s in load_from_mgf(spectrums_file)]
38 | 
39 |     # omit spectrums that didn't qualify for analysis
40 |     spectrums = [s for s in spectrums if s is not None]
41 | 
42 |     # convert spectrums to spectrum 'documents'
43 |     documents = [SpectrumDocument(s, n_decimals=1) for s in spectrums]
44 | 
45 |     model_file = os.path.join(repository_root, "integration-tests", "test_user_workflow_spec2vec.model")
46 |     if os.path.isfile(model_file):
47 |         model = gensim.models.Word2Vec.load(model_file)
48 |     else:
49 |         # create and train model (gensim >=4 expects vector_size instead of size)
50 |         model = gensim.models.Word2Vec([d.words for d in documents], vector_size=5, min_count=1)
51 |         model.train([d.words for d in documents], total_examples=len(documents), epochs=20)
52 |         model.save(model_file)
53 | 
54 |     # define similarity_function
55 |     spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5)
56 | 
57 |     references = documents[:26]
58 |     queries = documents[25:]
59 | 
60 |     # calculate scores on all combinations of references and queries
61 |     scores = list(calculate_scores(references, queries, spec2vec))
62 | 
63 |     # filter out self-comparisons
64 |     filtered = [(reference, query, score) for (reference, query, score) in scores if reference != query]
65 | 
66 |     sorted_by_score = sorted(filtered, key=lambda elem: 
elem[2], reverse=True) 67 | 68 | actual_top10 = sorted_by_score[:10] 69 | 70 | expected_top10 = [ 71 | (documents[19], documents[25], 0.9999121928249473), 72 | (documents[20], documents[25], 0.9998846890269892), 73 | (documents[20], documents[45], 0.9998756073673759), 74 | (documents[25], documents[45], 0.9998750427994474), 75 | (documents[19], documents[27], 0.9998722768460854), 76 | (documents[22], documents[27], 0.9998633023352553), 77 | (documents[18], documents[27], 0.9998616961532616), 78 | (documents[19], documents[45], 0.9998528723697396), 79 | (documents[14], documents[71], 0.9998404364805897), 80 | (documents[20], documents[27], 0.9998336807761137) 81 | ] 82 | 83 | assert [x[0] for x in actual_top10] == [x[0] for x in expected_top10] 84 | assert [x[1] for x in actual_top10] == [x[1] for x in expected_top10] 85 | assert np.allclose([x[2][0] for x in actual_top10], [x[2] for x in expected_top10]), "Expected different top 10 table." 86 | -------------------------------------------------------------------------------- /readthedocs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /readthedocs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | import spec2vec 16 | 17 | 18 | d = os.path.dirname(os.path.realpath(__file__)) 19 | sys.path.insert(0, os.path.join(d, "..")) 20 | 21 | 22 | # -- Project information ----------------------------------------------------- 23 | 24 | project = "spec2vec" 25 | copyright = "2020, Netherlands eScience Center" 26 | author = "Netherlands eScience Center" 27 | 28 | 29 | # -- General configuration --------------------------------------------------- 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named "sphinx.ext.*") or your custom 33 | # ones. 
34 | extensions = [ 35 | "sphinx.ext.autodoc", 36 | "sphinx.ext.coverage", 37 | "sphinx.ext.intersphinx", 38 | "sphinx.ext.viewcode", 39 | "sphinx.ext.todo", 40 | "sphinx.ext.doctest", 41 | "sphinxcontrib.apidoc", 42 | "sphinx.ext.napoleon", 43 | ] 44 | 45 | apidoc_module_dir = "../spec2vec" 46 | apidoc_output_dir = "./api" 47 | apidoc_excluded_paths = ["tests", "readthedocs"] 48 | apidoc_separate_modules = True 49 | apidoc_module_first = True 50 | # Hide undocumented member by excluding default undoc-members option 51 | os.environ["SPHINX_APIDOC_OPTIONS"] = "members,show-inheritance" 52 | 53 | # Add any paths that contain templates here, relative to this directory. 54 | templates_path = ["_templates"] 55 | 56 | # The language for content autogenerated by Sphinx. Refer to documentation 57 | # for a list of supported languages. 58 | # 59 | # This is also used if you do content translation via gettext catalogs. 60 | # Usually you set "language" from the command line for these cases. 61 | language = "en" 62 | 63 | # List of patterns, relative to source directory, that match files and 64 | # directories to ignore when looking for source files. 65 | # This pattern also affects html_static_path and html_extra_path. 66 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "readthedocs/conf.rst"] 67 | 68 | # Include class __init__ and __call__ docstrings. 69 | autodoc_default_options = { 70 | 'special-members': '__init__,__call__', 71 | } 72 | 73 | # -- Options for HTML output ------------------------------------------------- 74 | 75 | # The theme to use for HTML and HTML Help pages. See the documentation for 76 | # a list of builtin themes. 77 | # 78 | html_theme = "alabaster" 79 | 80 | html_theme_options = { 81 | "github_user": "spec2vec", 82 | "github_repo": "spec2vec", 83 | } 84 | 85 | # Add any paths that contain custom static files (such as style sheets) here, 86 | # relative to this directory. They are copied after the builtin static files, 87 | # so a file named "default.css" will overwrite the builtin "default.css". 88 | html_static_path = [] 89 | 90 | # -- Extension configuration ------------------------------------------------- 91 | 92 | # -- Options for todo extension ---------------------------------------------- 93 | 94 | # If true, `todo` and `todoList` produce output, else they produce nothing. 95 | todo_include_todos = True 96 | 97 | # -- Options for intersphinx extension ---------------------------------------------- 98 | 99 | intersphinx_mapping = { 100 | "https://docs.python.org/3": None, 101 | "numpy": ("https://docs.scipy.org/doc/numpy", None), 102 | "scipy": ("https://docs.scipy.org/doc/scipy/reference", None), 103 | "gensim": ("https://radimrehurek.com/gensim", None), 104 | "matchms": ("https://matchms.readthedocs.io/en/latest", None), 105 | } 106 | -------------------------------------------------------------------------------- /readthedocs/index.rst: -------------------------------------------------------------------------------- 1 | .. spec2vec documentation master file, created by 2 | sphinx-quickstart on Tue Apr 7 09:16:44 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to spec2vec's documentation! 7 | ==================================== 8 | 9 | Word2Vec based similarity measure of mass spectrometry data. 10 | 11 | .. 
toctree::
12 |    :maxdepth: 3
13 |    :caption: Contents:
14 |
15 |    API
16 |
17 | Installation
18 | ============
19 |
20 | Prerequisites:
21 |
22 | - Python 3.7 or 3.8
23 | - Recommended: Anaconda
24 |
25 | We recommend installing spec2vec from Anaconda Cloud with
26 |
27 | .. code-block:: console
28 |
29 |     # install spec2vec in a new virtual environment to avoid dependency clashes
30 |     conda create --name spec2vec python=3.8
31 |     conda activate spec2vec
32 |     conda install --channel nlesc --channel bioconda --channel conda-forge spec2vec
33 |
34 | Alternatively, spec2vec can also be installed using ``pip``. When using spec2vec together with ``matchms``, note that only the Anaconda install will also ensure that ``rdkit`` is installed properly, which is required for a few matchms filter functions (it is not required for any spec2vec-related functionality).
35 |
36 | .. code-block:: console
37 |
38 |     pip install spec2vec
39 |
40 | Examples
41 | ========
42 |
43 | Train a word2vec model
44 | **********************
45 | Below is a code example of how to process a large data set of reference spectra to
46 | train a word2vec model from scratch. Spectra are converted to documents using :py:class:`~spec2vec.SpectrumDocument`, which converts spectrum peaks into "words" according to their m/z ratio (for instance ``peak@100.39``). A new word2vec model can then be trained using :py:func:`~spec2vec.model_building.train_new_word2vec_model`, which will set the training parameters to spec2vec defaults unless specified otherwise. Word2Vec models learn from co-occurrences of peaks ("words") across many different spectra.
47 | To get a model that can give a meaningful representation of a set of
48 | given spectra it is desirable to train the model on a large and representative
49 | dataset.
50 |
51 | .. 
code-block:: python
52 |
53 |     import os
54 |     from matchms.filtering import add_losses
55 |     from matchms.filtering import add_parent_mass
56 |     from matchms.filtering import default_filters
57 |     from matchms.filtering import normalize_intensities
58 |     from matchms.filtering import reduce_to_number_of_peaks
59 |     from matchms.filtering import require_minimum_number_of_peaks
60 |     from matchms.filtering import select_by_mz
61 |     from matchms.importing import load_from_mgf
62 |     from spec2vec import SpectrumDocument
63 |     from spec2vec.model_building import train_new_word2vec_model
64 |
65 |     def spectrum_processing(s):
66 |         """This is how one would typically design a desired pre- and post-
67 |         processing pipeline."""
68 |         s = default_filters(s)
69 |         s = add_parent_mass(s)
70 |         s = normalize_intensities(s)
71 |         s = reduce_to_number_of_peaks(s, n_required=10, ratio_desired=0.5, n_max=500)
72 |         s = select_by_mz(s, mz_from=0, mz_to=1000)
73 |         s = add_losses(s, loss_mz_from=10.0, loss_mz_to=200.0)
74 |         s = require_minimum_number_of_peaks(s, n_required=10)
75 |         return s
76 |
77 |     # Load data from MGF file and apply filters
78 |     spectrums = [spectrum_processing(s) for s in load_from_mgf("reference_spectrums.mgf")]
79 |
80 |     # Omit spectrums that didn't qualify for analysis
81 |     spectrums = [s for s in spectrums if s is not None]
82 |
83 |     # Create spectrum documents
84 |     reference_documents = [SpectrumDocument(s, n_decimals=2) for s in spectrums]
85 |
86 |     model_file = "references.model"
87 |     model = train_new_word2vec_model(reference_documents, iterations=[10, 20, 30], filename=model_file,
88 |                                      workers=2, progress_logger=True)
89 |
90 | Derive spec2vec similarity scores
91 | *********************************
92 | Once a word2vec model has been trained, spec2vec allows you to calculate the similarities
93 | between mass spectrums based on this model. In cases where the word2vec model was
94 | trained on data different from the data it is applied to, a number of peaks ("words")
95 | might be unknown to the model (if they weren't part of the training dataset). To
96 | account for those cases it is important to specify the ``allowed_missing_percentage``,
97 | as in the example below.
98 |
99 | .. 
code-block:: python
100 |
101 |     import gensim
102 |     from matchms import calculate_scores
103 |     from spec2vec import Spec2Vec
104 |
105 |     # query_spectrums loaded from files using https://matchms.readthedocs.io/en/latest/api/matchms.importing.load_from_mgf.html
106 |     query_spectrums = [spectrum_processing(s) for s in load_from_mgf("query_spectrums.mgf")]
107 |
108 |     # Omit spectrums that didn't qualify for analysis
109 |     query_spectrums = [s for s in query_spectrums if s is not None]
110 |
111 |     # Import pre-trained word2vec model (see code example above)
112 |     model_file = "references.model"
113 |     model = gensim.models.Word2Vec.load(model_file)
114 |
115 |     # Define similarity_function
116 |     spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5,
117 |                         allowed_missing_percentage=5.0)
118 |
119 |     # Calculate scores on all combinations of reference spectrums and queries
120 |     scores = calculate_scores(reference_documents, query_spectrums, spec2vec)
121 |
122 |     # Find the highest scores for a query spectrum of interest
123 |     best_matches = scores.scores_by_query(query_spectrums[0], sort=True)[:10]
124 |
125 |     # Return highest scores
126 |     print([x[1] for x in best_matches])
127 |
128 | Indices and tables
129 | ==================
130 |
131 | * :ref:`genindex`
132 | * :ref:`modindex`
133 | * :ref:`search`
134 |
--------------------------------------------------------------------------------
/readthedocs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo. 
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [bumpversion]
 2 | current_version = 0.8.1
 3 | 
 4 | [bumpversion:file:conda/meta.yaml]
 5 | search = set version = "{current_version}"
 6 | replace = set version = "{new_version}"
 7 | 
 8 | [bumpversion:file:spec2vec/__version__.py]
 9 | search = __version__ = '{current_version}'
10 | replace = __version__ = '{new_version}'
11 | 
12 | [isort]
13 | sections = FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER
14 | no_lines_before = FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER
15 | lines_after_imports = 2
16 | 
17 | [metadata]
18 | description-file = README.rst
19 | 
20 | [aliases]
21 | test = pytest
22 | 
23 | [coverage:run]
24 | branch = True
25 | source = spec2vec
26 | 
27 | [tool:pytest]
28 | testpaths = tests integration-tests
29 | python_classes = *TestSuite
30 | junit_family = xunit2
31 | 
32 | [build_sphinx]
33 | source-dir = docs
34 | build-dir = docs/_build
35 | all_files = 1
36 | builder = html
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import os
 3 | from setuptools import find_packages, setup
 4 | 
 5 | 
 6 | here = os.path.abspath(os.path.dirname(__file__))
 7 | 
 8 | version = {}
 9 | with open(os.path.join(here, "spec2vec", "__version__.py")) as f:
10 |     exec(f.read(), version)
11 | 
12 | with open("README.rst") as readme_file:
13 |     readme = readme_file.read()
14 | 
15 | setup(
16 |     name="spec2vec",
17 |     version=version["__version__"],
18 |     description="Word2Vec based similarity measure of mass spectrometry data.",
19 |     long_description=readme,
20 |     long_description_content_type="text/x-rst",
21 |     author="Spec2Vec developer team",
22 |     author_email="florian.huber@hs-duesseldorf.de",
23 |     url="https://github.com/iomega/spec2vec",
24 |     packages=find_packages(),
25 |     include_package_data=True,
26 |     license="Apache Software License 2.0",
27 |     zip_safe=False,
28 |     keywords=[
29 |         "word2vec",
30 |         "mass spectrometry",
31 |         "fuzzy matching",
32 |         "fuzzy search"
33 |     ],
34 |     classifiers=[
35 |         "Development Status :: 4 - Beta",
36 |         "Intended Audience :: Education",
37 |         "Intended Audience :: Science/Research",
38 |         "Intended Audience :: Developers",
39 |         "License :: OSI Approved :: Apache Software License",
40 |         "Natural Language :: English",
41 |         "Programming Language :: Python :: 3",
42 |         "Programming Language :: Python :: 3.7",
43 |         "Programming Language :: Python :: 3.8",
44 |         "Programming Language :: Python :: 3.9",
45 |     ],
46 |     test_suite="tests",
47 |     python_requires='>=3.7',
48 |     install_requires=[
49 |         "gensim >=4.2.0",
50 |         "matchms >=0.14.0,<=0.26.4",
51 |         "numba >=0.51",
52 |         "numpy",
53 |         "scipy <=1.10.1",
54 |         "tqdm",
55 |     ],
56 |     extras_require={"dev": ["bump2version",
57 |                             "isort>=5.1.0",
58 |                             "pylint<2.12.0",
59 |                             "prospector[with_pyroma]",
60 |                             "pytest",
61 |                             "pytest-cov",
62 |                             "sphinx>=4.0.0",
63 |                             "sphinx_rtd_theme",
64 |                             "sphinxcontrib-apidoc",
65 |                             "yapf",],
66 |                     }
67 | )
68 | 
-------------------------------------------------------------------------------- /sonar-project.properties: -------------------------------------------------------------------------------- 1 | sonar.organization=iomega 2 | sonar.projectKey=iomega_spec2vec 3 | sonar.host.url=https://sonarcloud.io 4 | sonar.sources=spec2vec/ 5 | sonar.tests=tests/,integration-tests/ 6 | sonar.links.homepage=https://github.com/iomega/spec2vec 7 | sonar.links.scm=https://github.com/iomega/spec2vec 8 | sonar.links.issue=https://github.com/iomega/spec2vec/issues 9 | sonar.links.ci=https://github.com/iomega/spec2vec/actions 10 | sonar.python.coverage.reportPaths=coverage.xml 11 | sonar.python.xunit.reportPath=xunit-result.xml 12 | sonar.python.pylint.reportPath=pylint-report.txt 13 | -------------------------------------------------------------------------------- /spec2vec/Document.py: -------------------------------------------------------------------------------- 1 | class Document: 2 | """Parent class for documents as required by spec2vec. 3 | 4 | Use this as parent class to build your own document class. An example used for 5 | mass spectra is SpectrumDocument.""" 6 | def __init__(self, obj): 7 | """ 8 | 9 | Parameters 10 | ---------- 11 | obj: 12 | Input object of desired class. 13 | """ 14 | self._obj = obj 15 | self._index = 0 16 | self._make_words() 17 | 18 | def __iter__(self): 19 | return self 20 | 21 | def __len__(self): 22 | return len(self.words) 23 | 24 | def __next__(self): 25 | """gensim.models.Word2Vec() wants its corpus elements to be iterable""" 26 | if self._index < len(self.words): 27 | word = self.words[self._index] 28 | self._index += 1 29 | return word 30 | self._index = 0 31 | raise StopIteration 32 | 33 | def __str__(self): 34 | return self.words.__str__() 35 | 36 | def _make_words(self): 37 | print("You should override this method in your own subclass.") 38 | self.words = [] 39 | return self 40 | -------------------------------------------------------------------------------- /spec2vec/Spec2Vec.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import List, Union 3 | import numpy as np 4 | from gensim.models import Word2Vec 5 | from matchms import Spectrum 6 | from matchms.similarity.BaseSimilarity import BaseSimilarity 7 | from tqdm import tqdm 8 | from spec2vec.serialization import Word2VecLight 9 | from spec2vec.SpectrumDocument import SpectrumDocument 10 | from spec2vec.vector_operations import (calc_vector, cosine_similarity, 11 | cosine_similarity_matrix) 12 | 13 | 14 | class Spec2Vec(BaseSimilarity): 15 | """Calculate spec2vec similarity scores between a reference and a query. 16 | 17 | Using a trained model, spectrum documents will be converted into spectrum 18 | vectors. The spec2vec similarity is then the cosine similarity score between 19 | two spectrum vectors. 20 | 21 | The following code example shows how to calculate spec2vec similarities 22 | between query and reference spectrums. It uses a dummy model that can be found at 23 | :download:`../integration-tests/test_user_workflow_spec2vec.model ` 24 | and a small test dataset that can be found at 25 | :download:`../tests/pesticides.mgf `. 26 | 27 | .. 
testcode::
 28 |
 29 |         import os
 30 |         import gensim
 31 |         from matchms import calculate_scores
 32 |         from matchms.filtering import add_losses
 33 |         from matchms.filtering import default_filters
 34 |         from matchms.filtering import normalize_intensities
 35 |         from matchms.filtering import require_minimum_number_of_peaks
 36 |         from matchms.filtering import select_by_intensity
 37 |         from matchms.filtering import select_by_mz
 38 |         from matchms.importing import load_from_mgf
 39 |         from spec2vec import Spec2Vec
 40 |
 41 |         def spectrum_processing(s):
 42 |             '''This is how a user would typically design their own pre- and post-
 43 |             processing pipeline.'''
 44 |             s = default_filters(s)
 45 |             s = normalize_intensities(s)
 46 |             s = select_by_mz(s, mz_from=0, mz_to=1000)
 47 |             s = select_by_intensity(s, intensity_from=0.01)
 48 |             s = add_losses(s, loss_mz_from=10.0, loss_mz_to=200.0)
 49 |             s = require_minimum_number_of_peaks(s, n_required=5)
 50 |             return s
 51 |
 52 |         spectrums_file = os.path.join(os.getcwd(), "..", "tests", "data", "pesticides.mgf")
 53 |
 54 |         # Load data and apply the above defined filters to the data
 55 |         spectrums = [spectrum_processing(s) for s in load_from_mgf(spectrums_file)]
 56 |
 57 |         # Omit spectrums that didn't qualify for analysis
 58 |         spectrums = [s for s in spectrums if s is not None]
 59 |
 60 |         # Load pretrained model (here dummy model)
 61 |         model_file = os.path.join(os.getcwd(), "..", "integration-tests", "test_user_workflow_spec2vec.model")
 62 |         model = gensim.models.Word2Vec.load(model_file)
 63 |
 64 |         # Define similarity_function
 65 |         spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5)
 66 |
 67 |         # Calculate scores on all combinations of references and queries
 68 |         scores = calculate_scores(spectrums[10:], spectrums[:10], spec2vec)
 69 |
 70 |         # Select top-10 candidates for first query spectrum
 71 |         spectrum0_top10 = scores.scores_by_query(spectrums[0], sort=True)[:10]
 72 |
 73 |         # Display spectrum IDs for top-10 matches (only works if metadata contains "spectrum_id" field)
 74 |         print([s[0].metadata['spectrum_id'] for s in spectrum0_top10])
 75 |
 76 |     Should output
 77 |
 78 |     .. testoutput::
 79 |
 80 |         ['CCMSLIB00001058300', 'CCMSLIB00001058289', 'CCMSLIB00001058303', ...
 81 |
 82 |     """
 83 |     def __init__(self, model: Union[Word2Vec, Word2VecLight], intensity_weighting_power: Union[float, int] = 0,
 84 |                  allowed_missing_percentage: Union[float, int] = 10, progress_bar: bool = False):
 85 |         """
 86 |
 87 |         Parameters
 88 |         ----------
 89 |         model:
 90 |             Expected input is a gensim word2vec model that has been trained on
 91 |             the desired set of spectrum documents.
 92 |         intensity_weighting_power:
 93 |             Spectrum vectors are a weighted sum of the word vectors. The given
 94 |             word intensities will be raised to the given power.
 95 |             The default is 0, which means that no weighting will be done.
 96 |         allowed_missing_percentage:
 97 |             Set the maximum allowed percentage of the document that may be missing
 98 |             from the input model. This is measured as percentage of the weighted, missing
 99 |             words compared to all word vectors of the document. Default is 10, which
100 |             means up to 10% missing words are allowed. If more words are missing from
101 |             the model, an empty embedding will be returned (leading to similarities of 0)
102 |             and a warning is raised.
103 |         progress_bar:
104 |             Set to True to monitor the embedding creation with a progress bar.
105 |             Default is False. 
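
        As a rough illustration of ``allowed_missing_percentage`` (numbers chosen
        for illustration only): with the default of 10, a document whose missing
        words carry 12% of the total intensity-weighted word weight yields a zero
        vector (and hence similarity scores of 0), while a document missing only
        5% is still embedded from the remaining known words.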
106 | """ 107 | self.model = model 108 | self.n_decimals = self._get_word_decimals(self.model) 109 | self.intensity_weighting_power = intensity_weighting_power 110 | self.allowed_missing_percentage = allowed_missing_percentage 111 | self.vector_size = model.wv.vector_size 112 | self.disable_progress_bar = not progress_bar 113 | 114 | def pair(self, reference: Union[SpectrumDocument, Spectrum], 115 | query: Union[SpectrumDocument, Spectrum]) -> float: 116 | """Calculate the spec2vec similaritiy between a reference and a query. 117 | 118 | Parameters 119 | ---------- 120 | reference: 121 | Reference spectrum or spectrum document. 122 | query: 123 | Query spectrum or spectrum document. 124 | 125 | Returns 126 | ------- 127 | spec2vec_similarity 128 | Spec2vec similarity score. 129 | """ 130 | reference_vector = self._calculate_embedding(reference) 131 | query_vector = self._calculate_embedding(query) 132 | 133 | return cosine_similarity(reference_vector, query_vector) 134 | 135 | def matrix(self, references: Union[List[SpectrumDocument], List[Spectrum]], 136 | queries: Union[List[SpectrumDocument], List[Spectrum]], 137 | array_type: str = "numpy", 138 | is_symmetric: bool = False) -> np.ndarray: 139 | """Calculate the spec2vec similarities between all references and queries. 140 | 141 | Parameters 142 | ---------- 143 | references: 144 | Reference spectrums or spectrum documents. 145 | queries: 146 | Query spectrums or spectrum documents. 147 | array_type 148 | Specify the output array type. Can be "numpy" or "sparse". 149 | Currently, only "numpy" is supported and will return a numpy array. 150 | Future versions will include "sparse" as option to return a COO-sparse array. 151 | is_symmetric: 152 | Set to True if references == queries to speed up calculation about 2x. 153 | Uses the fact that in this case score[i, j] = score[j, i]. Default is False. 154 | 155 | Returns 156 | ------- 157 | spec2vec_similarity 158 | Array of spec2vec similarity scores. 159 | """ 160 | n_rows = len(references) 161 | reference_vectors = np.empty((n_rows, self.vector_size), dtype="float") 162 | for index_reference, reference in enumerate(tqdm(references, desc='Calculating vectors of reference spectrums', 163 | disable=self.disable_progress_bar)): 164 | reference_vectors[index_reference, 0:self.vector_size] = self._calculate_embedding(reference) 165 | 166 | n_cols = len(queries) 167 | if is_symmetric: 168 | assert np.all(references == queries), \ 169 | "Expected references to be equal to queries for is_symmetric=True" 170 | query_vectors = reference_vectors 171 | else: 172 | query_vectors = np.empty((n_cols, self.vector_size), dtype="float") 173 | for index_query, query in enumerate(tqdm(queries, desc='Calculating vectors of query spectrums', 174 | disable=self.disable_progress_bar)): 175 | query_vectors[index_query, 0:self.vector_size] = self._calculate_embedding(query) 176 | 177 | spec2vec_similarity = cosine_similarity_matrix(reference_vectors, query_vectors) 178 | 179 | return spec2vec_similarity 180 | 181 | @staticmethod 182 | def _get_word_decimals(model): 183 | """Read the decimal rounding that was used to train the model""" 184 | word_regex = r"[a-z]{4}@[0-9]{1,5}." 
185 |         example_word = next(iter(model.wv.key_to_index))
186 |
187 |         return len(re.split(word_regex, example_word)[-1])
188 |
189 |     def _calculate_embedding(self, spectrum_in: Union[SpectrumDocument, Spectrum]):
190 |         """Generate Spec2Vec embedding vectors from input spectrum (or SpectrumDocument)"""
191 |         if isinstance(spectrum_in, Spectrum):
192 |             spectrum_in = SpectrumDocument(spectrum_in, n_decimals=self.n_decimals)
193 |         elif isinstance(spectrum_in, SpectrumDocument):
194 |             assert spectrum_in.n_decimals == self.n_decimals, \
195 |                 "Decimal rounding of input data does not agree with model vocabulary."
196 |         else:
197 |             raise ValueError("Expected input type to be Spectrum or SpectrumDocument")
198 |         return calc_vector(self.model,
199 |                            spectrum_in,
200 |                            self.intensity_weighting_power,
201 |                            self.allowed_missing_percentage)
202 |
--------------------------------------------------------------------------------
/spec2vec/SpectrumDocument.py:
--------------------------------------------------------------------------------
  1 | from typing import Optional
  2 | from matchms.Spikes import Spikes
  3 | from .Document import Document
  4 | 
  5 | 
  6 | class SpectrumDocument(Document):
  7 |     """Create documents from spectra.
  8 | 
  9 |     Every peak (and loss) position (m/z value) will be converted into a string "word".
 10 |     The entire list of all peak words forms a spectrum document. Peak words have
 11 |     the form "peak@100.32" (for n_decimals=2), and losses have the form "loss@100.32".
 12 |     Peaks with identical resulting strings will not be merged, hence the same word can
 13 |     occur multiple times in a document (e.g. peaks at 100.31 and 100.29 would lead to
 14 |     two words "peak@100.3" when using n_decimals=1).
 15 | 
 16 |     For example:
 17 | 
 18 |     .. testcode::
 19 | 
 20 |         import numpy as np
 21 |         from matchms import Spectrum
 22 |         from spec2vec import SpectrumDocument
 23 | 
 24 |         spectrum = Spectrum(mz=np.array([100.0, 150.0, 200.51]),
 25 |                             intensities=np.array([0.7, 0.2, 0.1]),
 26 |                             metadata={'compound_name': 'substance1'})
 27 |         spectrum_document = SpectrumDocument(spectrum, n_decimals=1)
 28 | 
 29 |         print(spectrum_document.words)
 30 |         print(spectrum_document.peaks.mz)
 31 |         print(spectrum_document.get("compound_name"))
 32 | 
 33 |     Should output
 34 | 
 35 |     .. testoutput::
 36 | 
 37 |         ['peak@100.0', 'peak@150.0', 'peak@200.5']
 38 |         [100.   150.   200.51]
 39 |         substance1
 40 |     """
 41 |     def __init__(self, spectrum, n_decimals: int = 2):
 42 |         """
 43 | 
 44 |         Parameters
 45 |         ----------
 46 |         spectrum: SpectrumType
 47 |             Input spectrum.
 48 |         n_decimals
 49 |             Peak positions are converted to strings with n_decimal decimals.
 50 |             The default is 2, which would convert a peak at 100.387 into the
 51 |             word "peak@100.39". 
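            With n_decimals=1, the same peak would instead become the word
            "peak@100.4".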
52 | """ 53 | self.n_decimals = n_decimals 54 | self.weights = None 55 | super().__init__(obj=spectrum) 56 | self._add_weights() 57 | 58 | def _make_words(self): 59 | """Create word from peaks (and losses).""" 60 | peak_words = [f"peak@{mz:.{self.n_decimals}f}" for mz in self._obj.peaks.mz] 61 | if self._obj.losses is not None: 62 | loss_words = [f"loss@{mz:.{self.n_decimals}f}" for mz in self._obj.losses.mz] 63 | else: 64 | loss_words = [] 65 | self.words = peak_words + loss_words 66 | return self 67 | 68 | def _add_weights(self): 69 | """Add peaks (and loss) intensities as weights.""" 70 | assert self._obj.peaks.intensities.max() <= 1, "peak intensities not normalized" 71 | 72 | peak_intensities = self._obj.peaks.intensities.tolist() 73 | if self._obj.losses is not None: 74 | loss_intensities = self._obj.losses.intensities.tolist() 75 | else: 76 | loss_intensities = [] 77 | self.weights = peak_intensities + loss_intensities 78 | return self 79 | 80 | def get(self, key: str, default=None): 81 | """Retrieve value from Spectrum metadata dict. Shorthand for 82 | 83 | .. code-block:: python 84 | 85 | val = self._obj.metadata[key] 86 | 87 | """ 88 | assert not hasattr(self, key), "Key cannot be attribute of SpectrumDocument class" 89 | return self._obj.get(key, default) 90 | 91 | @property 92 | def metadata(self): 93 | """Return metadata of original spectrum.""" 94 | return self._obj.metadata 95 | 96 | @property 97 | def losses(self) -> Optional[Spikes]: 98 | """Return losses of original spectrum.""" 99 | return self._obj.losses 100 | 101 | @property 102 | def peaks(self) -> Spikes: 103 | """Return peaks of original spectrum.""" 104 | return self._obj.peaks 105 | -------------------------------------------------------------------------------- /spec2vec/__init__.py: -------------------------------------------------------------------------------- 1 | from . import serialization 2 | from .__version__ import __version__ 3 | from .Document import Document 4 | from .logging_functions import _init_logger 5 | from .Spec2Vec import Spec2Vec 6 | from .SpectrumDocument import SpectrumDocument 7 | from .vector_operations import calc_vector 8 | 9 | 10 | _init_logger() 11 | 12 | 13 | __all__ = [ 14 | "__version__", 15 | "calc_vector", 16 | "Document", 17 | "serialization", 18 | "SpectrumDocument", 19 | "Spec2Vec", 20 | ] 21 | -------------------------------------------------------------------------------- /spec2vec/__version__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.8.1' 2 | -------------------------------------------------------------------------------- /spec2vec/logging_functions.py: -------------------------------------------------------------------------------- 1 | """Spec2Vec logger. 2 | 3 | Spec2Vec functions and method report unexpected or undesired behavior as 4 | logging WARNING, and additional information as INFO. 5 | The default logging level is set to WARNING. 6 | The logger is an adaptation of the matchms logger. 7 | 8 | 9 | If you want to output additional 10 | logging messages, you can lower the logging level to INFO using set_spec2vec_logger_level: 11 | 12 | .. code-block:: python 13 | 14 | from spec2vec import set_spec2vec_logger_level 15 | 16 | set_spec2vec_logger_level("INFO") 17 | 18 | This can also be combined with setting the matchms logger which occurs separately 19 | by using set_matchms_logger_level: 20 | 21 | .. 
code-block:: python
 22 | 
 23 |     from matchms import set_matchms_logger_level
 24 |     from spec2vec import set_spec2vec_logger_level
 25 | 
 26 |     set_matchms_logger_level("INFO")
 27 |     set_spec2vec_logger_level("INFO")
 28 | 
 29 | If you want to suppress logging warnings, you can also raise the logging level
 30 | to ERROR by:
 31 | 
 32 | .. code-block:: python
 33 | 
 34 |     set_spec2vec_logger_level("ERROR")
 35 | 
 36 | To write logging entries to a local file, you can do the following:
 37 | 
 38 | .. code-block:: python
 39 | 
 40 |     from spec2vec.logging_functions import add_logging_to_file
 41 | 
 42 |     add_logging_to_file("sample.log", loglevel="INFO")
 43 | 
 44 | If you want to write the logging messages to a local file while silencing the
 45 | stream of such messages, you can do the following:
 46 | 
 47 | .. code-block:: python
 48 | 
 49 |     from spec2vec.logging_functions import add_logging_to_file
 50 | 
 51 |     add_logging_to_file("sample.log", loglevel="INFO",
 52 |                         remove_stream_handlers=True)
 53 | 
 54 | """
 55 | import logging
 56 | import logging.config
 57 | import sys
 58 | import matchms.logging_functions as matchms_logging
 59 | 
 60 | 
 61 | _formatter = logging.Formatter(
 62 |     '%(asctime)s:%(levelname)s:%(name)s:%(module)s:%(message)s')
 63 | 
 64 | 
 65 | def _init_logger(logger_name="spec2vec"):
 66 |     """Initialize spec2vec logger."""
 67 |     logger = logging.getLogger(logger_name)
 68 |     logger.setLevel(logging.WARNING)
 69 |     handler = logging.StreamHandler(sys.stdout)
 70 |     handler.setLevel(logging.WARNING)
 71 |     handler.setFormatter(_formatter)
 72 |     logger.addHandler(handler)
 73 |     logger.info('Completed configuring spec2vec logger.')
 74 | 
 75 | 
 76 | def set_spec2vec_logger_level(loglevel: str, logger_name="spec2vec"):
 77 |     """Update logging level to given loglevel.
 78 | 
 79 |     Parameters
 80 |     ----------
 81 |     loglevel
 82 |         Can be 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'.
 83 |     logger_name
 84 |         Default is "spec2vec". Change if logger name should be different.
 85 |     """
 86 |     matchms_logging.set_matchms_logger_level(loglevel=loglevel, logger_name=logger_name)
 87 | 
 88 | 
 89 | def add_logging_to_file(filename: str, loglevel: str = "INFO",
 90 |                         remove_stream_handlers: bool = False,
 91 |                         logger_name="spec2vec"):
 92 |     """Add logging to file.
 93 | 
 94 |     Current implementation does not change the initial logging stream,
 95 |     but simply adds a FileHandler to write logging entries to a file.
 96 | 
 97 |     Parameters
 98 |     ----------
 99 |     filename
100 |         Name of the file to write logging output to.
101 |     loglevel
102 |         Can be 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'.
103 |     remove_stream_handlers
104 |         Set to True if only logging to file is desired.
105 |     logger_name
106 |         Default is "spec2vec". Change if logger name should be different.
107 |     """
108 |     matchms_logging.add_logging_to_file(filename=filename,
109 |                                         loglevel=loglevel,
110 |                                         remove_stream_handlers=remove_stream_handlers,
111 |                                         logger_name=logger_name)
112 | 
113 | 
114 | def reset_spec2vec_logger(logger_name="spec2vec"):
115 |     """Reset spec2vec logger to initial state.
116 | 
117 |     This will remove all logging Handlers and initialize a new spec2vec logger.
118 |     Use this function to reset previous changes made to the default spec2vec logger.
119 | 
120 |     Parameters
121 |     ----------
122 |     logger_name
123 |         Default is "spec2vec". Change if logger name should be different. 
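
    For example:

    .. code-block:: python

        from spec2vec.logging_functions import reset_spec2vec_logger

        reset_spec2vec_logger()  # removes all handlers and re-creates the default logger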
124 | """ 125 | logger = logging.getLogger(logger_name) 126 | logger.handlers.clear() 127 | _init_logger() 128 | -------------------------------------------------------------------------------- /spec2vec/model_building.py: -------------------------------------------------------------------------------- 1 | """This module contains functions that will help users to train a word2vec model 2 | through gensim. 3 | """ 4 | import logging 5 | from typing import List, Tuple, Union 6 | import gensim 7 | from spec2vec.utils import ModelSaver, TrainingProgressLogger 8 | 9 | 10 | logger = logging.getLogger("spec2vec") 11 | 12 | 13 | def train_new_word2vec_model(documents: List, iterations: Union[List[int], int], filename: str = None, 14 | progress_logger: bool = True, **settings) -> gensim.models.Word2Vec: 15 | """Train a new Word2Vec model (using gensim). Save to file if filename is given. 16 | 17 | Example code on how to train a word2vec model on a corpus (=list of documents) 18 | that is derived from a given set of spectrums (list of matchms.Spectrum instances): 19 | 20 | .. code-block:: python 21 | 22 | from matchms import SpectrumDocument 23 | from spec2vec.model_building import train_new_word2vec_model 24 | 25 | documents = [SpectrumDocument(s, n_decimals=1) for s in spectrums] 26 | model = train_new_word2vec_model(documents, iterations=20, size=200, 27 | workers=1, progress_logger=False) 28 | 29 | Parameters 30 | ---------- 31 | documents: 32 | List of documents, each document being a list of words (strings). 33 | iterations: 34 | Specifies the number of training interations. This can be done by setting 35 | iterations to the total number of training epochs (e.g. "iterations=15"), 36 | or by passing a list of iterations (e.g. "iterations=[5,10,15]") which will 37 | also led to a total training of max(iterations) epochs, but will save the 38 | model for every iteration in the list. Temporary models will be saved 39 | using the name: filename_TEMP_{#iteration}epoch.model". 40 | filename: str, 41 | Filename to save model. Default is None, which means no model will be saved. 42 | If a list of iterations is passed (e.g. "iterations=[5,10,15]"), then 43 | intermediate models will be saved during training (here after 5, 10 44 | iterations) using the pattern: filename_TEMP_{#iteration}epoch.model 45 | learning_rate_initial: 46 | Set initial learning rate. Default is 0.025. 47 | learning_rate_decay: 48 | After every epoch the learning rate will be lowered by the learning_rate_decay. 49 | Default is 0.00025. 50 | progress_logger: 51 | If True, the training progress will be printed every epoch. Default is True. 52 | **settings 53 | All other named arguments will be passed to the :py:class:`gensim.models.word2vec.Word2Vec` constructor. 54 | sg: int (0,1) 55 | For sg = 0 --> CBOW model, for sg = 1 --> skip gram model 56 | (see Gensim documentation). Default for Spec2Vec is 0. 57 | negative: int 58 | from Gensim: If > 0, negative sampling will be used, the int for 59 | negative specifies how many “noise words” should be drawn (usually 60 | between 5-20). If set to 0, no negative sampling is used. 61 | Default for Spec2Vec is 5. 62 | size: int, 63 | Dimensions of word vectors. Default is 300. 64 | window: int, 65 | Window size for context words (small for local context, larger for 66 | global context). Spec2Vec expects large windwos. Default is 500. 67 | min_count: int, 68 | Only consider words that occur at least min_count times in the corpus. 69 | Default is 1. 
 70 |         workers: int,
 71 |             Number of threads to run the training on (should not be more than
 72 |             the number of cores/threads). Default is 4.
 73 | 
 74 |     Returns
 75 |     -------
 76 |     word2vec_model
 77 |         Gensim word2vec model.
 78 |     """
 79 |     settings = set_spec2vec_defaults(**settings)
 80 | 
 81 |     num_of_epochs = max(iterations) if isinstance(iterations, list) else iterations
 82 | 
 83 |     # Convert spec2vec style arguments to gensim style arguments
 84 |     settings = learning_rates_to_gensim_style(num_of_epochs, **settings)
 85 | 
 86 |     # Set callbacks
 87 |     callbacks = []
 88 |     if progress_logger:
 89 |         training_progress_logger = TrainingProgressLogger(num_of_epochs)
 90 |         callbacks.append(training_progress_logger)
 91 |     if filename:
 92 |         if isinstance(iterations, int):
 93 |             iterations = [iterations]
 94 |         model_saver = ModelSaver(num_of_epochs, iterations, filename)
 95 |         callbacks.append(model_saver)
 96 | 
 97 |     # Train word2vec model
 98 |     model = gensim.models.Word2Vec(documents, callbacks=callbacks, **settings)
 99 | 
100 |     return model
101 | 
102 | 
103 | def set_spec2vec_defaults(**settings):
104 |     """Set spec2vec default argument values (where no user input is given)."""
105 |     defaults = {
106 |         "sg": 0,
107 |         "negative": 5,
108 |         "vector_size": 300,
109 |         "window": 500,
110 |         "min_count": 1,
111 |         "learning_rate_initial": 0.025,
112 |         "learning_rate_decay": 0.00025,
113 |         "workers": 4,
114 |         "compute_loss": True,
115 |     }
116 |     assert "min_alpha" not in settings, "Expect 'learning_rate_decay' to describe learning rate decrease."
117 |     assert "alpha" not in settings, "Expect 'learning_rate_initial' instead of 'alpha'."
118 | 
119 |     # Set default parameters or replace by **settings input
120 |     for key, value in defaults.items():
121 |         if key in settings:
122 |             msg = f"The value of {key} is set from {value} (default) to {settings[key]}"
123 |             logger.info(msg)
124 |         else:
125 |             settings[key] = value
126 |     return settings
127 | 
128 | 
129 | def learning_rates_to_gensim_style(num_of_epochs, **settings):
130 |     """Convert "learning_rate_initial" and "learning_rate_decay" to gensim
131 |     "alpha" and "min_alpha"."""
132 |     alpha, min_alpha = set_learning_rate_decay(settings["learning_rate_initial"],
133 |                                                settings["learning_rate_decay"], num_of_epochs)
134 |     settings["alpha"] = alpha
135 |     settings["min_alpha"] = min_alpha
136 |     settings["epochs"] = num_of_epochs
137 | 
138 |     # Remove non-Gensim arguments from settings
139 |     del settings["learning_rate_initial"]
140 |     del settings["learning_rate_decay"]
141 |     return settings
142 | 
143 | 
144 | def set_learning_rate_decay(learning_rate_initial: float, learning_rate_decay: float,
145 |                             num_of_epochs: int) -> Tuple[float, float]:
146 |     """The learning rate in Gensim model training is defined by an initial rate
147 |     (alpha) and a final rate (min_alpha), which can be unintuitive. Here those
148 |     parameters will be set based on the given values for learning_rate_initial,
149 |     num_of_epochs, and learning_rate_decay.
150 | 
151 |     Parameters
152 |     ----------
153 |     learning_rate_initial:
154 |         Set initial learning rate.
155 |     learning_rate_decay:
156 |         After every epoch, the learning rate will be lowered by the learning_rate_decay.
157 |     num_of_epochs:
158 |         Total number of epochs for training.
159 | 
160 |     Returns
161 |     -------
162 |     alpha:
163 |         Initial learning rate.
164 |     min_alpha:
165 |         Final learning rate. 
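
    For example, learning_rate_initial=0.025, learning_rate_decay=0.00025 and
    num_of_epochs=20 give min_alpha = 0.025 - 20 * 0.00025 = 0.02.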
166 | """ 167 | min_alpha = learning_rate_initial - num_of_epochs * learning_rate_decay 168 | if min_alpha < 0: 169 | msg = ("Number of total iterations is too high for given learning_rate decay.", 170 | f"Learning_rate_decay will be set from {learning_rate_decay} ", 171 | "to {learning_rate_initial/num_of_epochs}.") 172 | logger.warning(msg) 173 | min_alpha = 0 174 | return learning_rate_initial, min_alpha 175 | -------------------------------------------------------------------------------- /spec2vec/serialization/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions for exporting and importing trained :class:`~gensim.models.Word2Vec` model to and from disk. 3 | ########################################## 4 | Functions provide the ability to export and import trained :class:`~gensim.models.Word2Vec` model to and from disk 5 | without pickling the model. The model can be stored in two files: `.json` for metadata and `.npy` for weights. 6 | """ 7 | from .model_exporting import export_model 8 | from .model_importing import Word2VecLight, import_model 9 | 10 | 11 | __all__ = [ 12 | "export_model", 13 | "import_model", 14 | "Word2VecLight" 15 | ] 16 | -------------------------------------------------------------------------------- /spec2vec/serialization/model_exporting.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from copy import deepcopy 4 | from typing import Union 5 | import numpy as np 6 | import scipy.sparse 7 | from gensim.models import Word2Vec 8 | 9 | 10 | def export_model(model: Word2Vec, 11 | output_model_file: Union[str, os.PathLike], 12 | output_weights_file: Union[str, os.PathLike]): 13 | """ 14 | Write a lightweight version of a :class:`~gensim.model.Word2Vec` model to disk. Such a model can be read to 15 | calculate scores but is not capable of further training. 16 | 17 | Parameters 18 | ---------- 19 | model: 20 | :class:`~gensim.models.Word2Vec` trained model. 21 | output_model_file: 22 | A path of json file to save the model. 23 | output_weights_file: 24 | A path of `.npy` file to save the model's weights. 25 | """ 26 | model = deepcopy(model) 27 | keyedvectors = extract_keyedvectors(model) 28 | weights = keyedvectors.pop("vectors") 29 | keyedvectors["__weights_format"] = get_weights_format(weights) 30 | 31 | save_model(keyedvectors, output_model_file) 32 | save_weights(weights, output_weights_file) 33 | 34 | 35 | def save_weights(weights: Union[np.ndarray, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix], 36 | output_weights_file: Union[str, os.PathLike]): 37 | """ 38 | Write model's weights to disk in `.npy` dense array format. If the weights are sparse, they are converted to dense 39 | prior to saving. 40 | """ 41 | if isinstance(weights, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)): 42 | weights = weights.toarray() 43 | 44 | np.save(output_weights_file, weights, allow_pickle=False) 45 | 46 | 47 | def save_model(keyedvectors: dict, output_model_file: Union[str, os.PathLike]): 48 | """Write model's metadata to disk in json format.""" 49 | with open(output_model_file, "w", encoding="utf-8") as f: 50 | json.dump(keyedvectors, f) 51 | 52 | 53 | def get_weights_format(weights: Union[np.ndarray, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]) -> str: 54 | """ 55 | Get the array format of the model's weights. 56 | 57 | Parameters 58 | ---------- 59 | weights: 60 | Model's weights. 
61 | 62 | Returns 63 | ------- 64 | weights_format: 65 | Format of the model's weights. 66 | """ 67 | if isinstance(weights, np.ndarray): 68 | return "np.ndarray" 69 | if isinstance(weights, scipy.sparse.csr_matrix): 70 | return "csr_matrix" 71 | if isinstance(weights, scipy.sparse.csc_matrix): 72 | return "csc_matrix" 73 | raise NotImplementedError("The model's weights format is not supported.") 74 | 75 | 76 | def extract_keyedvectors(model: Word2Vec) -> dict: 77 | """ 78 | Extract :class:`~gensim.models.KeyedVectors` object from the model, convert it to a dictionary and 79 | remove redundant keys. 80 | 81 | Parameters 82 | ---------- 83 | model: 84 | :class:`~gensim.models.Word2Vec` trained model. 85 | 86 | Returns 87 | ------- 88 | keyedvectors: 89 | Dictionary representation of :class:`~gensim.models.KeyedVectors` without redundant keys. 90 | """ 91 | keyedvectors = model.wv.__dict__ 92 | keyedvectors.pop("vectors_lockf", None) 93 | keyedvectors.pop("expandos", None) 94 | return keyedvectors 95 | -------------------------------------------------------------------------------- /spec2vec/serialization/model_importing.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import Union 4 | import numpy as np 5 | import scipy.sparse 6 | from gensim.models import KeyedVectors 7 | 8 | 9 | class Word2VecLight: 10 | """ 11 | A lightweight version of :class:`~gensim.models.Word2Vec`. The objects of this class follow the interface of the 12 | original :class:`~gensim.models.Word2Vec` to the point necessary to calculate Spec2Vec scores. The model cannot be 13 | used for further training. 14 | """ 15 | 16 | def __init__(self, model: dict, weights: Union[np.ndarray, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]): 17 | """ 18 | 19 | Parameters 20 | ---------- 21 | model: 22 | A dictionary containing the model's metadata. 23 | weights: 24 | A numpy array or a scipy sparse matrix containing the model's weights. 25 | """ 26 | self.wv: KeyedVectors = self._KeyedVectorsBuilder().from_dict(model).with_weights(weights).build() 27 | 28 | class _KeyedVectorsBuilder: 29 | def __init__(self): 30 | self.vector_size = None 31 | self.weights = None 32 | 33 | def build(self) -> KeyedVectors: 34 | keyed_vectors = KeyedVectors(self.vector_size) 35 | keyed_vectors.__dict__ = self.__dict__ 36 | keyed_vectors.vectors = self.weights 37 | return keyed_vectors 38 | 39 | def from_dict(self, dictionary: dict): 40 | expected_keys = {"vector_size", "__numpys", "__scipys", "__ignoreds", "__recursive_saveloads", 41 | "index_to_key", "norms", "key_to_index", "__weights_format"} 42 | if dictionary.keys() == expected_keys: 43 | self.__dict__ = dictionary 44 | elif expected_keys.symmetric_difference(dictionary.keys()) == {"next_index"}: # backward compatibility 45 | dictionary.pop("next_index") 46 | self.__dict__ = dictionary 47 | else: 48 | raise ValueError("The keys of model's dictionary representation do not match the expected keys.") 49 | return self 50 | 51 | def with_weights(self, weights: Union[np.ndarray, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]): 52 | self.weights = weights 53 | return self 54 | 55 | 56 | def import_model(model_file, weights_file) -> Word2VecLight: 57 | """ 58 | Read a lightweight version of a :class:`~gensim.models.Word2Vec` model from disk. 59 | 60 | Parameters 61 | ---------- 62 | model_file: 63 | A path of json file to load the model. 64 | weights_file: 65 | A path of `.npy` file to load the model's weights. 
66 | 67 | Returns 68 | ------- 69 | :class:`~spec2vec.serialization.model_importing.Word2VecLight` – a lightweight version of a 70 | :class:`~gensim.models.Word2Vec` 71 | """ 72 | with open(model_file, "r", encoding="utf-8") as f: 73 | model: dict = json.load(f) 74 | 75 | weights = load_weights(weights_file, model["__weights_format"]) 76 | return Word2VecLight(model, weights) 77 | 78 | 79 | def load_weights(weights_file: Union[str, os.PathLike], 80 | weights_format: str) -> Union[np.ndarray, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]: 81 | weights: np.ndarray = np.load(weights_file, allow_pickle=False) 82 | 83 | weights_array_builder = {"csr_matrix": scipy.sparse.csr_matrix, 84 | "csc_matrix": scipy.sparse.csc_matrix, 85 | "np.ndarray": lambda x: x} 86 | weights = weights_array_builder[weights_format](weights) 87 | 88 | return weights 89 | -------------------------------------------------------------------------------- /spec2vec/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from gensim.models.callbacks import CallbackAny2Vec 3 | 4 | 5 | class TrainingProgressLogger(CallbackAny2Vec): 6 | """Callback to log training progress.""" 7 | 8 | def __init__(self, num_of_epochs: int): 9 | """ 10 | 11 | Parameters 12 | ---------- 13 | num_of_epochs: 14 | Total number of training epochs. 15 | """ 16 | self.epoch = 0 17 | self.num_of_epochs = num_of_epochs 18 | self.loss = 0 19 | 20 | def on_epoch_end(self, model): 21 | """Return progress of model training""" 22 | loss = model.get_latest_training_loss() 23 | 24 | print('\r', 25 | ' Epoch ' + str(self.epoch+1) + ' of ' + str(self.num_of_epochs) + '.', 26 | end="") 27 | print(f'Change in loss after epoch {self.epoch + 1}: {loss - self.loss}') 28 | self.epoch += 1 29 | self.loss = loss 30 | 31 | 32 | class ModelSaver(CallbackAny2Vec): 33 | """Callback to save model during training (when specified).""" 34 | 35 | def __init__(self, num_of_epochs: int, iterations: List, filename: str): 36 | """ 37 | 38 | Parameters 39 | ---------- 40 | num_of_epochs: 41 | Total number of training epochs. 42 | iterations: 43 | Number of total iterations or list of iterations at which to save the 44 | model. 45 | filename: 46 | Filename to save model. 
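
        For example, ModelSaver(num_of_epochs=15, iterations=[5, 10, 15],
        filename="references.model") writes "references_iter_5.model" and
        "references_iter_10.model" during training, and "references.model"
        after the final epoch.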
47 | """ 48 | self.epoch = 0 49 | self.num_of_epochs = num_of_epochs 50 | self.iterations = iterations 51 | self.filename = filename 52 | 53 | def on_epoch_end(self, model): 54 | """Allow saving model during training when specified in iterations.""" 55 | self.epoch += 1 56 | 57 | if self.filename and self.epoch in self.iterations: 58 | if self.epoch < self.num_of_epochs: 59 | filename = f"{self.filename.split('.model')[0]}_iter_{self.epoch}.model" 60 | else: 61 | filename = self.filename 62 | print("Saving model with name:", filename) 63 | model.save(filename) 64 | -------------------------------------------------------------------------------- /spec2vec/vector_operations.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Union 3 | import numba 4 | import numpy as np 5 | from gensim.models.basemodel import BaseTopicModel 6 | from spec2vec.Document import Document 7 | 8 | 9 | logger = logging.getLogger("spec2vec") 10 | 11 | 12 | def calc_vector(model: BaseTopicModel, document: Document, 13 | intensity_weighting_power: Union[float, int] = 0, 14 | allowed_missing_percentage: Union[float, int] = 10) -> np.ndarray: 15 | """Compute document vector as a (weighted) sum of individual word vectors. 16 | 17 | Parameters 18 | ---------- 19 | model 20 | Pretrained word2vec model to convert words into vectors. 21 | document 22 | Document containing document.words and document.weights. 23 | intensity_weighting_power 24 | Specify to what power weights should be raised. The default is 0, which 25 | means that no weighing will be done. 26 | allowed_missing_percentage: 27 | Set the maximum allowed percentage of the document that may be missing 28 | from the input model. This is measured as percentage of the weighted, missing 29 | words compared to all word vectors of the document. Default is 10, which 30 | means up to 10% missing words are allowed. If more words are missing from 31 | the model, an empty embedding will be returned (leading to similarities of 0) 32 | and a warning is raised. 33 | 34 | Returns 35 | ------- 36 | vector 37 | Vector representing the input document in latent space. Will return None 38 | if the missing percentage of the document in the model is > allowed_missing_percentage. 39 | """ 40 | assert max(document.weights) <= 1.0, "Weights are not normalized to unity as expected." 41 | assert 0 <= allowed_missing_percentage <= 100.0, "allowed_missing_percentage must be within [0,100]" 42 | 43 | def _check_model_coverage(): 44 | """Return True if model covers enough of the document words.""" 45 | if len(idx_not_in_model) > 0: 46 | weights_missing = np.array([document.weights[i] for i in idx_not_in_model]) 47 | weights_missing_raised = np.power(weights_missing, intensity_weighting_power) 48 | missing_percentage = 100 * weights_missing_raised.sum() / (weights_raised.sum() 49 | + weights_missing_raised.sum()) 50 | msg = (f"Found {len(idx_not_in_model)} word(s) missing in the model.", 51 | f"Weighted missing percentage not covered by the given model is {missing_percentage:.2f}%.") 52 | logger.info(msg) 53 | 54 | if missing_percentage > allowed_missing_percentage: 55 | msg = (f"Missing percentage ({missing_percentage:.2f}%) is above set maximum. 
An empty vector will be returned.", 56 | "Consider retraining the used model or change the `allowed_missing_percentage`.") 57 | logger.warning(msg) 58 | return False 59 | return True 60 | 61 | idx_not_in_model = [i for i, x in enumerate(document.words) if x not in model.wv.key_to_index] 62 | if len(idx_not_in_model) == len(document.words): 63 | msg = ("Spectrum without peaks known by the used model. An empty vector will be returned.", 64 | "Consider retraining the used model or make sure decimal rounding is correct.") 65 | logger.warning(msg) 66 | return np.zeros(model.wv.vector_size) 67 | 68 | words_in_model = [x for i, x in enumerate(document.words) if i not in idx_not_in_model] 69 | weights_in_model = np.asarray([x for i, x in enumerate(document.weights) 70 | if i not in idx_not_in_model]).reshape(len(words_in_model), 1) 71 | 72 | word_vectors = model.wv[words_in_model] 73 | weights_raised = np.power(weights_in_model, intensity_weighting_power) 74 | 75 | if _check_model_coverage() is True: 76 | weights_raised_tiled = np.tile(weights_raised, (1, model.wv.vector_size)) 77 | return np.sum(word_vectors * weights_raised_tiled, 0) 78 | return np.zeros(model.wv.vector_size) 79 | 80 | 81 | @numba.njit 82 | def cosine_similarity_matrix(vectors_1: np.ndarray, vectors_2: np.ndarray) -> np.ndarray: 83 | """Fast implementation of cosine similarity between two arrays of vectors. 84 | 85 | For example: 86 | 87 | .. code-block:: python 88 | 89 | import numpy as np 90 | from spec2vec.vector_operations import cosine_similarity_matrix 91 | 92 | vectors_1 = np.array([[1, 1, 0, 0], 93 | [1, 0, 1, 1]]) 94 | vectors_2 = np.array([[0, 1, 1, 0], 95 | [0, 0, 1, 1]]) 96 | similarity_matrix = cosine_similarity_matrix(vectors_1, vectors_2) 97 | 98 | 99 | Parameters 100 | ---------- 101 | vectors_1 102 | Numpy array of vectors. vectors_1.shape[0] is number of vectors, vectors_1.shape[1] 103 | is vector dimension. 104 | vectors_2 105 | Numpy array of vectors. vectors_2.shape[0] is number of vectors, vectors_2.shape[1] 106 | is vector dimension. 107 | """ 108 | assert vectors_1.shape[1] == vectors_2.shape[1], "Input vectors must have same shape." 109 | vectors_1 = vectors_1.astype(np.float64) # Numba dot only accepts float or complex arrays 110 | vectors_2 = vectors_2.astype(np.float64) 111 | norm_1 = np.sqrt(np.sum(vectors_1**2, axis=1)) 112 | norm_2 = np.sqrt(np.sum(vectors_2**2, axis=1)) 113 | for i in range(vectors_1.shape[0]): 114 | vectors_1[i] = vectors_1[i] / norm_1[i] 115 | for i in range(vectors_2.shape[0]): 116 | vectors_2[i] = vectors_2[i] / norm_2[i] 117 | return np.dot(vectors_1, vectors_2.T) 118 | 119 | 120 | @numba.njit 121 | def cosine_similarity(vector1: np.ndarray, vector2: np.ndarray) -> np.float64: 122 | """Calculate cosine similarity between two input vectors. 123 | 124 | For example: 125 | 126 | .. testcode:: 127 | 128 | import numpy as np 129 | from spec2vec.vector_operations import cosine_similarity 130 | 131 | vector1 = np.array([1, 1, 0, 0]) 132 | vector2 = np.array([1, 1, 1, 1]) 133 | print("Cosine similarity: {:.3f}".format(cosine_similarity(vector1, vector2))) 134 | 135 | Should output 136 | 137 | .. testoutput:: 138 | 139 | Cosine similarity: 0.707 140 | 141 | Parameters 142 | ---------- 143 | vector1 144 | Input vector. Can be array of integers or floats. 145 | vector2 146 | Input vector. Can be array of integers or floats. 147 | """ 148 | assert vector1.shape[0] == vector2.shape[0], "Input vector must have same shape." 
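    # Accumulate the dot products with explicit loops; numba.njit compiles these
    # to fast machine code without allocating intermediate arrays.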
149 |     prod12 = 0
150 |     prod11 = 0
151 |     prod22 = 0
152 |     for i in range(vector1.shape[0]):
153 |         prod12 += vector1[i] * vector2[i]
154 |         prod11 += vector1[i] * vector1[i]
155 |         prod22 += vector2[i] * vector2[i]
156 |     cosine_score = 0
157 |     if prod11 != 0 and prod22 != 0:
158 |         cosine_score = prod12 / np.sqrt(prod11 * prod22)
159 |     return np.float64(cosine_score)
160 | 
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import pytest
3 | 
4 | 
5 | @pytest.fixture(scope="module")
6 | def test_dir(request):
7 |     """Return the directory of the currently running test script."""
8 |     return Path(request.fspath).parent
9 | 
--------------------------------------------------------------------------------
/tests/data/weights.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iomega/spec2vec/36553f0e1df589dc02fcb6945fe440ccc2769c69/tests/data/weights.npy
--------------------------------------------------------------------------------
/tests/test_document.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from spec2vec.Document import Document
3 | 
4 | 
5 | def test_document_init():
6 |     obj = "asdasd"
7 |     document = Document(obj=obj)
8 |     assert len(document) == 0
9 | 
10 | 
11 | def test_document_raises_stop_iteration():
12 |     obj = "asdasd"
13 |     document = Document(obj=obj)
14 |     with pytest.raises(StopIteration):
15 |         next(document)
16 | 
--------------------------------------------------------------------------------
/tests/test_logging.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import logging
3 | import os
4 | from spec2vec.logging_functions import (add_logging_to_file,
5 |                                         reset_spec2vec_logger,
6 |                                         set_spec2vec_logger_level)
7 | 
8 | 
9 | def test_initial_logging(caplog, capsys):
10 |     """Test logging functionality."""
11 |     reset_spec2vec_logger()
12 |     logger = logging.getLogger("spec2vec")
13 |     logger.info("info test")
14 |     logger.warning("warning test")
15 |     assert logger.name == "spec2vec", "Expected different logger name"
16 |     assert logger.getEffectiveLevel() == 30, "Expected different logging level"
17 |     assert "info test" not in caplog.text, "Info log should not be shown."
18 |     assert "warning test" in caplog.text, "Warning log should have been shown."
19 |     assert "warning test" in capsys.readouterr().out, \
20 |         "Warning log should have been shown on stdout."
21 |     reset_spec2vec_logger()
22 | 
23 | 
24 | def test_set_and_reset_spec2vec_logger_level(caplog):
25 |     """Test setting and resetting the spec2vec logger level."""
26 |     logger = logging.getLogger("spec2vec")
27 |     assert logger.getEffectiveLevel() == 30, "Expected different logging level"
28 | 
29 |     set_spec2vec_logger_level("INFO")
30 |     logger.debug("debug test")
31 |     logger.info("info test")
32 | 
33 |     assert logger.name == "spec2vec", "Expected different logger name"
34 |     assert logger.getEffectiveLevel() == 20, "Expected different logging level"
35 |     assert "debug test" not in caplog.text, "Debug log should not be shown."
36 |     assert "info test" in caplog.text, "Info log should have been shown."
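    # The reset below must restore the logger's default WARNING level (numeric value 30).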
37 | 
38 |     reset_spec2vec_logger()
39 |     assert logger.getEffectiveLevel() == 30, "Expected different logging level"
40 |     reset_spec2vec_logger()
41 | 
42 | 
43 | def test_add_logging_to_file(tmp_path, caplog, capsys):
44 |     """Test writing logs to file."""
45 |     reset_spec2vec_logger()
46 |     set_spec2vec_logger_level("INFO")
47 |     filename = os.path.join(tmp_path, "test.log")
48 |     add_logging_to_file(filename)
49 |     logger = logging.getLogger("spec2vec")
50 |     logger.info("test message no.1")
51 | 
52 |     expected_log_entry = "test message no.1"
53 |     # Test streamed logs
54 |     assert expected_log_entry in caplog.text, "Expected different log message."
55 |     assert expected_log_entry in capsys.readouterr().out, \
56 |         "Expected different log message in output (stdout/stderr)."
57 | 
58 |     # Test log file
59 |     expected_log_entry = "INFO:spec2vec:test_logging:test message no.1"
60 |     assert len(logger.handlers) == 2, "Expected two handlers"
61 |     assert os.path.isfile(filename), "Log file not found."
62 |     with open(filename, "r", encoding="utf-8") as file:
63 |         logs = file.read()
64 |     assert expected_log_entry in logs, "Expected different log file content"
65 |     reset_spec2vec_logger()
66 | 
67 | 
68 | def test_add_logging_to_file_only_file(tmp_path, capsys):
69 |     """Test writing logs to a file only (no stream handlers)."""
70 |     reset_spec2vec_logger()
71 |     set_spec2vec_logger_level("INFO")
72 |     filename = os.path.join(tmp_path, "test.log")
73 |     add_logging_to_file(filename, remove_stream_handlers=True)
74 |     logger = logging.getLogger("spec2vec")
75 |     logger.info("test message no.1")
76 | 
77 |     # Test streamed logs
78 |     not_expected_log_entry = "test message no.1"
79 |     assert len(logger.handlers) == 1, "Expected only one handler"
80 |     assert not_expected_log_entry not in capsys.readouterr().out, "Did not expect log message"
81 | 
82 |     # Test log file
83 |     expected_log_entry = "INFO:spec2vec:test_logging:test message no.1"
84 |     assert os.path.isfile(filename), "Log file not found."
85 |     with open(filename, "r", encoding="utf-8") as file:
86 |         logs = file.read()
87 |     assert expected_log_entry in logs, "Expected different log file content"
88 |     reset_spec2vec_logger()
89 | 
--------------------------------------------------------------------------------
/tests/test_model_building.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gensim
3 | import numpy as np
4 | import pytest
5 | from matchms import Spectrum
6 | from spec2vec import SpectrumDocument
7 | from spec2vec.model_building import (set_learning_rate_decay,
8 |                                      train_new_word2vec_model)
9 | 
10 | 
11 | def test_set_learning_rate_decay():
12 |     """Test if correct alpha and min_alpha are calculated."""
13 |     alpha, min_alpha = set_learning_rate_decay(0.5, 0.05, 8)
14 |     assert alpha == 0.5, "Expected different alpha."
15 |     assert min_alpha == 0.5 - 8 * 0.05, "Expected different min_alpha"
16 | 
17 | 
18 | def test_set_learning_rate_decay_rate_too_high():
19 |     """Test if correct alpha and min_alpha are calculated if rate is too high."""
20 |     alpha, min_alpha = set_learning_rate_decay(0.5, 0.05, 20)
21 |     assert alpha == 0.5, "Expected different alpha."
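    # 0.5 - 20 * 0.05 would be negative, so the decay is clipped and min_alpha floors at 0.0.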
22 |     assert min_alpha == 0.0, "Expected different min_alpha"
23 | 
24 | 
25 | def test_train_new_word2vec_model():
26 |     """Test training of a dummy model."""
27 |     # Create fake corpus
28 |     documents = []
29 |     for i in range(100):
30 |         spectrum = Spectrum(mz=np.linspace(i, 9+i, 10),
31 |                             intensities=np.ones((10)).astype("float"),
32 |                             metadata={})
33 |         documents.append(SpectrumDocument(spectrum, n_decimals=1))
34 |     model = train_new_word2vec_model(documents, iterations=20, vector_size=20,
35 |                                      progress_logger=False)
36 |     assert model.sg == 0, "Expected different default value."
37 |     assert model.negative == 5, "Expected different default value."
38 |     assert model.window == 500, "Expected different default value."
39 |     assert model.alpha == 0.025, "Expected different default value."
40 |     assert model.min_alpha == 0.02, "Expected different default value."
41 |     assert model.epochs == 20, "Expected different number of epochs."
42 |     assert model.wv.vector_size == 20, "Expected different vector size."
43 |     assert len(model.wv) == 109, "Expected different number of words in vocab."
44 |     assert model.wv.get_vector(documents[0].words[1]).shape[0] == 20, "Expected different vector size."
45 | 
46 | 
47 | def test_train_new_word2vec_model_with_logger_and_saving(tmp_path):
48 |     """Test training of a dummy model and save it."""
49 |     # Create fake corpus
50 |     documents = []
51 |     for i in range(100):
52 |         spectrum = Spectrum(mz=np.linspace(i, 9+i, 10),
53 |                             intensities=np.ones((10)).astype("float"),
54 |                             metadata={})
55 |         documents.append(SpectrumDocument(spectrum, n_decimals=1))
56 |     # Train model and write to file
57 |     filename = os.path.join(tmp_path, "test.model")
58 |     model = train_new_word2vec_model(documents, iterations=20, filename=filename,
59 |                                      vector_size=20, progress_logger=True)
60 | 
61 |     # Test if file exists
62 |     assert os.path.isfile(filename), "Could not find saved model file."
63 | 
64 |     # Test if saved model seems to be correct
65 |     model = gensim.models.Word2Vec.load(filename)
66 |     assert model.sg == 0, "Expected different default value."
67 |     assert model.negative == 5, "Expected different default value."
68 |     assert model.window == 500, "Expected different default value."
69 |     assert model.alpha == 0.025, "Expected different default value."
70 |     assert model.min_alpha == 0.02, "Expected different default value."
71 |     assert model.epochs == 20, "Expected different number of epochs."
72 |     assert model.wv.vector_size == 20, "Expected different vector size."
73 |     assert len(model.wv) == 109, "Expected different number of words in vocab."
74 |     assert model.wv.get_vector(documents[0].words[1]).shape[0] == 20, "Expected different vector size."
75 | 
76 | 
77 | def test_train_new_word2vec_model_wrong_entry():
78 |     """Test training of a dummy model with a gensim argument that is not accepted."""
79 |     # Create fake corpus
80 |     documents = []
81 |     for i in range(10):
82 |         spectrum = Spectrum(mz=np.linspace(i, 9+i, 10),
83 |                             intensities=np.ones((10)).astype("float"),
84 |                             metadata={})
85 |         documents.append(SpectrumDocument(spectrum, n_decimals=1))
86 | 
87 |     with pytest.raises(AssertionError) as msg:
88 |         _ = train_new_word2vec_model(documents, iterations=20, alpha=0.01,
89 |                                      progress_logger=False)
90 | 
91 |     expected_message_part = "Expect 'learning_rate_initial' instead of 'alpha'."
92 |     assert expected_message_part in str(msg.value), "Expected particular error message."
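    # train_new_word2vec_model manages its own learning-rate schedule, so gensim's raw
    # 'alpha' argument is rejected in favor of 'learning_rate_initial'.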
93 | 
--------------------------------------------------------------------------------
/tests/test_model_serialization.py:
--------------------------------------------------------------------------------
1 | import os
2 | from unittest.mock import MagicMock, patch
3 | import numpy as np
4 | import pytest
5 | from gensim.models import Word2Vec
6 | from matchms import Spectrum, calculate_scores
7 | from scipy.sparse import coo_matrix, csc_matrix, csr_matrix
8 | from spec2vec import Spec2Vec
9 | from spec2vec.serialization import Word2VecLight, export_model, import_model
10 | 
11 | 
12 | @pytest.fixture(params=["numpy", "scipy_csr", "scipy_csc"])
13 | def model(request, test_dir):
14 |     model_file = os.path.join(test_dir, "..", "integration-tests", "test_user_workflow_spec2vec.model")
15 |     model = Word2Vec.load(model_file)
16 | 
17 |     if request.param in ["scipy_csc", "scipy_csr"]:
18 |         scipy_matrix_builder = {"scipy_csr": csr_matrix, "scipy_csc": csc_matrix}
19 |         model.wv.__numpys, model.wv.__ignoreds = [], []
20 |         model.wv.__scipys = ["vectors"]  # pylint:disable=protected-access
21 |         model.wv.vectors = scipy_matrix_builder[request.param](model.wv.vectors)
22 |     return model
23 | 
24 | 
25 | def write_read_model(model, tmp_path):
26 |     model_file = tmp_path / "model.json"
27 |     weights_file = tmp_path / "weights.npy"
28 |     export_model(model, model_file, weights_file)
29 | 
30 |     model = import_model(model_file, weights_file)
31 |     return model
32 | 
33 | 
34 | def test_write_model_to_disk(model, tmp_path):
35 |     model_file = tmp_path / "model.json"
36 |     weights_file = tmp_path / "weights.npy"
37 |     export_model(model, model_file, weights_file)
38 | 
39 |     assert os.path.isfile(model_file)
40 |     assert os.path.isfile(weights_file)
41 | 
42 | 
43 | def test_read_model_from_disk(test_dir):
44 |     model_file = os.path.join(test_dir, "data", "model.json")
45 |     weights_file = os.path.join(test_dir, "data", "weights.npy")
46 |     model = import_model(model_file, weights_file)
47 | 
48 |     assert isinstance(model, Word2VecLight)
49 | 
50 | 
51 | def test_model_metadata_integrity(model, tmp_path):
52 |     imported_model = write_read_model(model, tmp_path)
53 | 
54 |     assert imported_model.wv.vector_size == model.wv.vector_size
55 |     assert imported_model.wv.key_to_index == model.wv.key_to_index
56 |     assert imported_model.wv.index_to_key == model.wv.index_to_key
57 |     assert imported_model.wv.__scipys == model.wv.__scipys  # pylint:disable=protected-access
58 |     assert imported_model.wv.__numpys == model.wv.__numpys  # pylint:disable=protected-access
59 |     assert imported_model.wv.__ignoreds == model.wv.__ignoreds  # pylint:disable=protected-access
60 | 
61 | 
62 | @pytest.mark.parametrize("model", ["numpy"], indirect=True)
63 | def test_dense_weights_integrity(model, tmp_path):
64 |     imported_model = write_read_model(model, tmp_path)
65 | 
66 |     assert (imported_model.wv.vectors == model.wv.vectors).all()
67 | 
68 | 
69 | @pytest.mark.parametrize("model", ["scipy_csr", "scipy_csc"], indirect=True)
70 | def test_sparse_weights_integrity(model, tmp_path):
71 |     imported_model = write_read_model(model, tmp_path)
72 | 
73 |     assert (imported_model.wv.vectors.toarray() == model.wv.vectors.toarray()).all()
74 | 
75 | 
76 | @patch("json.load", MagicMock(return_value={"unexpected_key": "value", "__weights_format": "np.ndarray"}))
77 | def test_reading_model_with_wrong_keys_fails(test_dir):
78 |     model_file = os.path.join(test_dir, "data", "model.json")
79 |     weights_file = os.path.join(test_dir, "data", "weights.npy")
80 | 
81 |     with pytest.raises(ValueError) as error:
82 |         import_model(model_file, weights_file)
83 | 
84 |     assert str(error.value) == "The keys of model's dictionary representation do not match the expected keys."
85 | 
86 | 
87 | def test_writing_model_with_wrong_weights_format_fails(model):
88 |     model.wv.vectors = coo_matrix(model.wv.vectors)
89 | 
90 |     with pytest.raises(NotImplementedError) as error:
91 |         export_model(model, "model.json", "weights.npy")
92 | 
93 |     assert str(error.value) == "The model's weights format is not supported."
94 | 
95 | 
96 | @pytest.mark.parametrize("model", ["numpy"], indirect=True)  # calculate_scores supports only numpy arrays
97 | def test_reloaded_model_computes_scores(model, tmp_path):
98 |     spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]),
99 |                           intensities=np.array([0.7, 0.2, 0.1]),
100 |                           metadata={'id': 'spectrum1'})
101 |     spectrum_2 = Spectrum(mz=np.array([100, 140, 190.]),
102 |                           intensities=np.array([0.4, 0.2, 0.1]),
103 |                           metadata={'id': 'spectrum2'})
104 |     spectrum_3 = Spectrum(mz=np.array([110, 140, 180.]),
105 |                           intensities=np.array([0.4, 0.3, 0.1]),
106 |                           metadata={'id': 'spectrum3'})
107 | 
108 |     queries = [spectrum_1, spectrum_2]
109 |     references = [spectrum_1, spectrum_2, spectrum_3]
110 | 
111 |     reloaded_model = write_read_model(model, tmp_path)
112 |     spec2vec_reloaded = Spec2Vec(reloaded_model, intensity_weighting_power=0.5)
113 |     spec2vec = Spec2Vec(model, intensity_weighting_power=0.5)
114 | 
115 |     scores = list(calculate_scores(references, queries, spec2vec))
116 |     scores_reloaded = list(calculate_scores(references, queries, spec2vec_reloaded))
117 | 
118 |     assert scores == scores_reloaded
119 | 
--------------------------------------------------------------------------------
/tests/test_spec2vec.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gensim
3 | import numpy as np
4 | import pytest
5 | from matchms import Spectrum
6 | from spec2vec import Spec2Vec, SpectrumDocument
7 | 
8 | 
9 | def test_spec2vec_pair_method_spectrum_entry():
10 |     """Test if pair of two Spectrums is handled correctly"""
11 |     spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]),
12 |                           intensities=np.array([0.7, 0.2, 0.1]),
13 |                           metadata={'id': 'spectrum1'})
14 |     spectrum_2 = Spectrum(mz=np.array([100, 140, 190.]),
15 |                           intensities=np.array([0.4, 0.2, 0.1]),
16 |                           metadata={'id': 'spectrum2'})
17 | 
18 |     model = load_test_model()
19 |     spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5)
20 |     score01 = spec2vec.pair(spectrum_1, spectrum_2)
21 |     assert score01 == pytest.approx(0.9936808, 1e-6)
22 |     score11 = spec2vec.pair(spectrum_2, spectrum_2)
23 |     assert score11 == pytest.approx(1.0, 1e-9)
24 | 
25 | 
26 | def test_spec2vec_pair_method_spectrumdocument_entry():
27 |     """Test if pair of two SpectrumDocuments is handled correctly"""
28 |     spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]),
29 |                           intensities=np.array([0.7, 0.2, 0.1]),
30 |                           metadata={'id': 'spectrum1'})
31 |     spectrum_2 = Spectrum(mz=np.array([100, 140, 190.]),
32 |                           intensities=np.array([0.4, 0.2, 0.1]),
33 |                           metadata={'id': 'spectrum2'})
34 | 
35 |     documents = [SpectrumDocument(s, n_decimals=1) for s in [spectrum_1, spectrum_2]]
36 |     model = load_test_model()
37 |     spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5)
38 |     score01 = spec2vec.pair(documents[0], documents[1])
39 |     assert score01 == pytest.approx(0.9936808, 1e-6)
40 |     score11 = spec2vec.pair(documents[1], documents[1])
41 |     assert score11 == pytest.approx(1.0, 1e-9)
42 | 
43 | 
44 | def test_spec2vec_pair_method_none_entry():
45 |     """Test if wrong input data raises expected exception"""
46 |     spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]),
47 |                           intensities=np.array([0.7, 0.2, 0.1]),
48 |                           metadata={'id': 'spectrum1'})
49 |     spectrum_2 = None
50 |     model = load_test_model()
51 |     spec2vec = Spec2Vec(model=model)
52 |     with pytest.raises(ValueError) as msg:
53 |         _ = spec2vec.pair(spectrum_1, spectrum_2)
54 | 
55 |     expected_msg = "Expected input type to be Spectrum or SpectrumDocument"
56 |     assert expected_msg in str(msg), "Expected different exception"
57 | 
58 | 
59 | def test_spec2vec_pair_method_wrong_spectrumdocument_entry():
60 |     """Test if SpectrumDocuments with different decimal rounding are handled correctly"""
61 |     spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]),
62 |                           intensities=np.array([0.7, 0.2, 0.1]),
63 |                           metadata={'id': 'spectrum1'})
64 |     spectrum_2 = Spectrum(mz=np.array([100, 140, 190.]),
65 |                           intensities=np.array([0.4, 0.2, 0.1]),
66 |                           metadata={'id': 'spectrum2'})
67 | 
68 |     documents = [SpectrumDocument(s, n_decimals=2) for s in [spectrum_1, spectrum_2]]
69 |     model = load_test_model()
70 |     spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5)
71 |     with pytest.raises(AssertionError) as msg:
72 |         _ = spec2vec.pair(documents[0], documents[1])
73 | 
74 |     expected_msg = "Decimal rounding of input data does not agree with model vocabulary."
75 |     assert expected_msg in str(msg), "Expected different exception"
76 | 
77 | 
78 | @pytest.mark.parametrize("progress_bar", [True, False])
79 | def test_spec2vec_matrix_method(progress_bar):
80 |     """Test if matrix of 2x2 SpectrumDocuments is handled correctly.
81 |     Run with and without progress bar.
82 |     """
83 |     spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]),
84 |                           intensities=np.array([0.7, 0.2, 0.1]),
85 |                           metadata={'id': 'spectrum1'})
86 |     spectrum_2 = Spectrum(mz=np.array([100, 140, 190.]),
87 |                           intensities=np.array([0.4, 0.2, 0.1]),
88 |                           metadata={'id': 'spectrum2'})
89 | 
90 |     documents = [SpectrumDocument(s, n_decimals=1) for s in [spectrum_1, spectrum_2]]
91 |     model = load_test_model()
92 |     spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5, progress_bar=progress_bar)
93 |     scores = spec2vec.matrix(documents, documents)
94 |     assert scores[0, 0] == pytest.approx(1.0, 1e-9), "Expected different score."
95 |     assert scores[1, 1] == pytest.approx(1.0, 1e-9), "Expected different score."
96 |     assert scores[1, 0] == pytest.approx(0.9936808, 1e-6), "Expected different score."
97 |     assert scores[0, 1] == pytest.approx(0.9936808, 1e-6), "Expected different score."
98 | 
99 | 
100 | def test_spec2vec_matrix_method_symmetric_spectrum_entry():
101 |     """Test if matrix of 2x2 Spectrums is handled correctly.
102 |     Run with is_symmetric=True.
103 |     """
104 |     spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]),
105 |                           intensities=np.array([0.7, 0.2, 0.1]),
106 |                           metadata={'id': 'spectrum1'})
107 |     spectrum_2 = Spectrum(mz=np.array([100, 140, 190.]),
108 |                           intensities=np.array([0.4, 0.2, 0.1]),
109 |                           metadata={'id': 'spectrum2'})
110 | 
111 |     spectrums = [spectrum_1, spectrum_2]
112 |     model = load_test_model()
113 |     spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5)
114 |     scores = spec2vec.matrix(spectrums, spectrums, is_symmetric=True)
115 |     assert scores[0, 0] == pytest.approx(1.0, 1e-9), "Expected different score."
116 |     assert scores[1, 1] == pytest.approx(1.0, 1e-9), "Expected different score."
117 |     assert scores[1, 0] == pytest.approx(0.9936808, 1e-6), "Expected different score."
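    # With identical references and queries the score matrix is symmetric,
    # so the [0, 1] entry must equal the [1, 0] entry checked above.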
118 |     assert scores[0, 1] == pytest.approx(0.9936808, 1e-6), "Expected different score."
119 | 
120 | 
121 | def test_spec2vec_matrix_method_symmetric_spectrumdocument_entry():
122 |     """Test if matrix of 2x2 SpectrumDocuments is handled correctly.
123 |     Run with is_symmetric=True.
124 |     """
125 |     spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]),
126 |                           intensities=np.array([0.7, 0.2, 0.1]),
127 |                           metadata={'id': 'spectrum1'})
128 |     spectrum_2 = Spectrum(mz=np.array([100, 140, 190.]),
129 |                           intensities=np.array([0.4, 0.2, 0.1]),
130 |                           metadata={'id': 'spectrum2'})
131 | 
132 |     documents = [SpectrumDocument(s, n_decimals=1) for s in [spectrum_1, spectrum_2]]
133 |     model = load_test_model()
134 |     spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5)
135 |     scores = spec2vec.matrix(documents, documents, is_symmetric=True)
136 |     assert scores[0, 0] == pytest.approx(1.0, 1e-9), "Expected different score."
137 |     assert scores[1, 1] == pytest.approx(1.0, 1e-9), "Expected different score."
138 |     assert scores[1, 0] == pytest.approx(0.9936808, 1e-6), "Expected different score."
139 |     assert scores[0, 1] == pytest.approx(0.9936808, 1e-6), "Expected different score."
140 | 
141 | 
142 | def test_spec2vec_matrix_method_symmetric_wrong_entry():
143 |     """Test if matrix of 2x2 SpectrumDocuments is handled correctly.
144 |     Run with is_symmetric=True but non-symmetric entries.
145 |     """
146 |     spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]),
147 |                           intensities=np.array([0.7, 0.2, 0.1]),
148 |                           metadata={'id': 'spectrum1'})
149 |     spectrum_2 = Spectrum(mz=np.array([100, 140, 190.]),
150 |                           intensities=np.array([0.4, 0.2, 0.1]),
151 |                           metadata={'id': 'spectrum2'})
152 | 
153 |     documents1 = [SpectrumDocument(s, n_decimals=1) for s in [spectrum_1, spectrum_2]]
154 |     documents2 = [SpectrumDocument(s, n_decimals=1) for s in [spectrum_2, spectrum_1]]
155 |     model = load_test_model()
156 |     spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5)
157 |     expected_msg = "Expected references to be equal to queries for is_symmetric=True"
158 |     with pytest.raises(AssertionError) as msg:
159 |         _ = spec2vec.matrix(documents1, documents2, is_symmetric=True)
160 |     assert expected_msg in str(msg), "Expected different exception message"
161 | 
162 | 
163 | def load_test_model():
164 |     """Load pretrained Word2Vec model."""
165 |     repository_root = os.path.join(os.path.dirname(__file__), "..")
166 |     model_file = os.path.join(repository_root, "integration-tests", "test_user_workflow_spec2vec.model")
167 |     assert os.path.isfile(model_file), "Expected file not found."
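    # The pretrained model from the integration tests is shared across the test
    # modules to avoid retraining a Word2Vec model for every test.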
168 |     return gensim.models.Word2Vec.load(model_file)
169 | 
--------------------------------------------------------------------------------
/tests/test_spectrum_document.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pytest
3 | from matchms import Spectrum
4 | from matchms.filtering import add_losses
5 | from spec2vec import SpectrumDocument
6 | 
7 | 
8 | def test_spectrum_document_init_n_decimals_default_value_no_losses():
9 | 
10 |     mz = np.array([10, 20, 30, 40], dtype="float")
11 |     intensities = np.array([0, 0.01, 0.1, 1], dtype="float")
12 |     metadata = dict(precursor_mz=100.0)
13 |     spectrum = Spectrum(mz=mz, intensities=intensities, metadata=metadata)
14 |     spectrum_document = SpectrumDocument(spectrum)
15 | 
16 |     assert spectrum_document.n_decimals == 2, "Expected different default for n_decimals"
17 |     assert len(spectrum_document) == 4
18 |     assert spectrum_document.words == [
19 |         "peak@10.00", "peak@20.00", "peak@30.00", "peak@40.00"
20 |     ]
21 |     assert next(spectrum_document) == "peak@10.00"
22 | 
23 | 
24 | def test_spectrum_document_init_n_decimals_1_no_losses():
25 |     mz = np.array([10, 20, 30, 40], dtype="float")
26 |     intensities = np.array([0, 0.01, 0.1, 1], dtype="float")
27 |     metadata = dict(precursor_mz=100.0)
28 |     spectrum = Spectrum(mz=mz, intensities=intensities, metadata=metadata)
29 |     spectrum_document = SpectrumDocument(spectrum, n_decimals=1)
30 | 
31 |     assert spectrum_document.n_decimals == 1
32 |     assert len(spectrum_document) == 4
33 |     assert spectrum_document.words == [
34 |         "peak@10.0", "peak@20.0", "peak@30.0", "peak@40.0"
35 |     ]
36 |     assert next(spectrum_document) == "peak@10.0"
37 | 
38 | 
39 | def test_spectrum_document_init_default_with_losses():
40 |     """Use default n_decimal and add losses."""
41 |     mz = np.array([10, 20, 30, 40], dtype="float")
42 |     intensities = np.array([0, 0.01, 0.1, 1], dtype="float")
43 |     metadata = dict(precursor_mz=100.0)
44 |     spectrum_in = Spectrum(mz=mz, intensities=intensities, metadata=metadata)
45 |     spectrum = add_losses(spectrum_in)
46 |     spectrum_document = SpectrumDocument(spectrum)
47 | 
48 |     assert spectrum_document.n_decimals == 2, "Expected different default for n_decimals"
49 |     assert len(spectrum_document) == 8
50 |     assert spectrum_document.words == [
51 |         "peak@10.00", "peak@20.00", "peak@30.00", "peak@40.00",
52 |         "loss@60.00", "loss@70.00", "loss@80.00", "loss@90.00"
53 |     ]
54 |     assert next(spectrum_document) == "peak@10.00"
55 | 
56 | 
57 | def test_spectrum_document_init_n_decimals_1():
58 |     """Use n_decimal=1 and add losses."""
59 |     mz = np.array([10, 20, 30, 40], dtype="float")
60 |     intensities = np.array([0, 0.01, 0.1, 1], dtype="float")
61 |     metadata = dict(precursor_mz=100.0)
62 |     spectrum_in = Spectrum(mz=mz, intensities=intensities, metadata=metadata)
63 |     spectrum = add_losses(spectrum_in)
64 |     spectrum_document = SpectrumDocument(spectrum, n_decimals=1)
65 | 
66 |     assert spectrum_document.n_decimals == 1
67 |     assert len(spectrum_document) == 8
68 |     assert spectrum_document.words == [
69 |         "peak@10.0", "peak@20.0", "peak@30.0", "peak@40.0",
70 |         "loss@60.0", "loss@70.0", "loss@80.0", "loss@90.0"
71 |     ]
72 |     assert next(spectrum_document) == "peak@10.0"
73 | 
74 | 
75 | def test_spectrum_document_metadata_getter():
76 |     """Test metadata getter"""
77 |     mz = np.array([10, 20, 30, 40], dtype="float")
78 |     intensities = np.array([0, 0.01, 0.1, 1], dtype="float")
79 |     metadata = {"precursor_mz": 100.0,
80 |                 "smiles": "testsmiles"}
81 |     spectrum_in = Spectrum(mz=mz, intensities=intensities, metadata=metadata)
82 |     spectrum_document = SpectrumDocument(spectrum_in, n_decimals=2)
83 | 
84 |     assert spectrum_document.n_decimals == 2
85 |     assert len(spectrum_document) == 4
86 |     assert spectrum_document.metadata == metadata, "Expected different metadata"
87 |     assert spectrum_document.get("smiles") == "testsmiles", "Expected different metadata"
88 |     assert spectrum_document.words == [
89 |         "peak@10.00", "peak@20.00", "peak@30.00", "peak@40.00"
90 |     ]
91 |     assert next(spectrum_document) == "peak@10.00"
92 | 
93 | 
94 | def test_spectrum_document_metadata_getter_notallowed_key():
95 |     """Test metadata getter with a key that is also a class attribute"""
96 |     mz = np.array([10], dtype="float")
97 |     intensities = np.array([0], dtype="float")
98 |     metadata = {"smiles": "testsmiles"}
99 |     spectrum_in = Spectrum(mz=mz, intensities=intensities, metadata=metadata)
100 |     spectrum_document = SpectrumDocument(spectrum_in, n_decimals=2)
101 | 
102 |     assert spectrum_document.n_decimals == 2
103 |     with pytest.raises(AssertionError) as msg:
104 |         spectrum_document.get("n_decimals")
105 | 
106 |     assert str(msg.value) == "Key cannot be attribute of SpectrumDocument class"
107 | 
108 | 
109 | def test_spectrum_document_peak_getter():
110 |     """Test peak getter"""
111 |     mz = np.array([10, 20, 30, 40], dtype="float")
112 |     intensities = np.array([0, 0.01, 0.1, 1], dtype="float")
113 |     metadata = {"precursor_mz": 100.0}
114 |     spectrum_in = Spectrum(mz=mz, intensities=intensities, metadata=metadata)
115 |     spectrum_document = SpectrumDocument(spectrum_in, n_decimals=2)
116 | 
117 |     assert spectrum_document.words == [
118 |         "peak@10.00", "peak@20.00", "peak@30.00", "peak@40.00"
119 |     ]
120 |     assert np.all(spectrum_document.peaks.mz == mz), "Expected different peak m/z"
121 |     assert np.all(spectrum_document.peaks.intensities == intensities), "Expected different peaks"
122 | 
123 | 
124 | def test_spectrum_document_losses_getter():
125 |     """Test losses getter"""
126 |     mz = np.array([10, 20, 30, 40], dtype="float")
127 |     intensities = np.array([0, 0.01, 0.1, 1], dtype="float")
128 |     metadata = {"precursor_mz": 100.0}
129 |     spectrum_in = Spectrum(mz=mz, intensities=intensities, metadata=metadata)
130 |     spectrum = add_losses(spectrum_in)
131 |     spectrum_document = SpectrumDocument(spectrum, n_decimals=2)
132 |     assert np.all(spectrum_document.losses.mz == np.array([60., 70., 80., 90.])), \
133 |         "Expected different losses"
134 |     assert np.all(spectrum_document.losses.intensities == intensities[::-1]), \
135 |         "Expected different losses"
136 | 
--------------------------------------------------------------------------------
/tests/test_vector_operations.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gensim
3 | import numpy as np
4 | import pytest
5 | from matchms import Spectrum
6 | from spec2vec import SpectrumDocument
7 | from spec2vec.logging_functions import (reset_spec2vec_logger,
8 |                                         set_spec2vec_logger_level)
9 | from spec2vec.vector_operations import (calc_vector, cosine_similarity,
10 |                                         cosine_similarity_matrix)
11 | 
12 | 
13 | def test_calc_vector():
14 |     """Test deriving a document vector using a pretrained network."""
15 |     spectrum = Spectrum(mz=np.array([100, 150, 200, 250], dtype="float"),
16 |                         intensities=np.array([0.1, 0.1, 0.1, 1.0], dtype="float"),
17 |                         metadata={})
18 | 
19 |     document = SpectrumDocument(spectrum, n_decimals=1)
20 |     model = import_pretrained_model()
21 |     vector = calc_vector(model, document, intensity_weighting_power=0.5, allowed_missing_percentage=1.0)
22 |     expected_vector = np.array([0.08982063, -1.43037023, -0.17572929, -0.45750666, 0.44942236,
23 |                                 1.35530729, -1.8305029, -0.36850534, -0.28393048, -0.34192028])
24 |     assert np.all(vector == pytest.approx(expected_vector, 1e-5)), "Expected different document vector."
25 | 
26 | 
27 | def test_calc_vector_missing_words_logging(caplog):
28 |     """Test using a pretrained network and a missing word."""
29 |     set_spec2vec_logger_level("INFO")
30 |     spectrum = Spectrum(mz=np.array([11.1, 100, 200, 250], dtype="float"),
31 |                         intensities=np.array([0.1, 0.1, 0.1, 1.0], dtype="float"),
32 |                         metadata={})
33 | 
34 |     document = SpectrumDocument(spectrum, n_decimals=1)
35 |     model = import_pretrained_model()
36 |     assert document.words[0] not in model.wv.key_to_index, "Expected word to be missing from given model."
37 | 
38 |     calc_vector(model, document, intensity_weighting_power=0.5,
39 |                 allowed_missing_percentage=100.0)
40 | 
41 |     expected_msg1 = "spec2vec:vector_operations.py"
42 |     expected_msg2 = "Found 1 word(s) missing in the model."
43 |     assert expected_msg1 in caplog.text, "Expected particular warning message."
44 |     assert expected_msg2 in caplog.text, "Expected particular warning message."
45 |     reset_spec2vec_logger()
46 | 
47 | 
48 | def test_calc_vector_higher_than_allowed_missing_percentage(caplog):
49 |     """Test using a pretrained network and a missing word percentage above the allowed maximum."""
50 |     spectrum = Spectrum(mz=np.array([11.1, 100, 200, 250], dtype="float"),
51 |                         intensities=np.array([0.1, 0.1, 0.1, 1.0], dtype="float"),
52 |                         metadata={})
53 | 
54 |     document = SpectrumDocument(spectrum, n_decimals=1)
55 |     model = import_pretrained_model()
56 |     assert document.words[0] not in model.wv.key_to_index, "Expected word to be missing from given model."
57 | 
58 |     calc_vector(model, document, intensity_weighting_power=0.5, allowed_missing_percentage=16.0)
59 | 
60 |     expected_message_part = "Missing percentage (16.23%) is above set maximum."
61 |     assert expected_message_part in caplog.text, "Expected particular warning message."
62 | 
63 | 
64 | def test_calc_vector_within_allowed_missing_percentage():
65 |     """Test using a pretrained network and a missing word percentage within the allowed maximum."""
66 |     spectrum = Spectrum(mz=np.array([11.1, 100, 200, 250], dtype="float"),
67 |                         intensities=np.array([0.1, 0.1, 0.1, 1.0], dtype="float"),
68 |                         metadata={})
69 | 
70 |     document = SpectrumDocument(spectrum, n_decimals=1)
71 |     model = import_pretrained_model()
72 |     vector = calc_vector(model, document, intensity_weighting_power=0.5, allowed_missing_percentage=17.0)
73 |     expected_vector = np.array([0.12775915, -1.17673617, -0.14598507, -0.40189132, 0.36908966,
74 |                                 1.11608575, -1.46774333, -0.31442554, -0.23168877, -0.29420064])
75 |     assert document.words[0] not in model.wv.key_to_index, "Expected word to be missing from given model."
76 |     assert np.all(vector == pytest.approx(expected_vector, 1e-5)), "Expected different document vector."
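    # The expected vectors above are tied to the bundled pretrained test model;
    # they need updating whenever that model is retrained.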
77 | 
78 | 
79 | def test_calc_vector_no_words_in_model(caplog):
80 |     """Test using a pretrained network which covers none of the 'words' of a given spectrum."""
81 |     spectrum = Spectrum(mz=np.array([11.0, 100.8, 200.8], dtype="float"),
82 |                         intensities=np.array([0.1, 0.2, 1.0], dtype="float"),
83 |                         metadata={})
84 | 
85 |     document = SpectrumDocument(spectrum, n_decimals=1)
86 |     model = import_pretrained_model()
87 |     for i in range(3):
88 |         assert document.words[i] not in model.wv.key_to_index, \
89 |             "Expected word to be missing from given model."
90 | 
91 |     vector = calc_vector(model, document, intensity_weighting_power=0.5)
92 | 
93 |     expected_message_part = "An empty vector will be returned."
94 |     assert expected_message_part in caplog.text, "Expected particular warning message."
95 |     assert np.all(vector == np.zeros(10)), "Expected empty vector"
96 | 
97 | 
98 | def import_pretrained_model():
99 |     """Helper function to import the pretrained word2vec model."""
100 |     repository_root = os.path.join(os.path.dirname(__file__), "..")
101 |     model_file = os.path.join(repository_root, "integration-tests", "test_user_workflow_spec2vec.model")
102 |     return gensim.models.Word2Vec.load(model_file)
103 | 
104 | 
105 | @pytest.mark.parametrize("numba_compiled", [True, False])
106 | def test_cosine_similarity(numba_compiled):
107 |     """Test cosine similarity score calculation."""
108 |     vector1 = np.array([1, 1, 0, 0])
109 |     vector2 = np.array([1, 1, 1, 1])
110 | 
111 |     if numba_compiled:
112 |         score11 = cosine_similarity(vector1, vector1)
113 |         score12 = cosine_similarity(vector1, vector2)
114 |         score22 = cosine_similarity(vector2, vector2)
115 |     else:
116 |         score11 = cosine_similarity.py_func(vector1, vector1)
117 |         score12 = cosine_similarity.py_func(vector1, vector2)
118 |         score22 = cosine_similarity.py_func(vector2, vector2)
119 | 
120 |     assert score12 == 2 / np.sqrt(2 * 4), "Expected different score."
121 |     assert score11 == score22 == 1.0, "Expected different score."
122 | 
123 | 
124 | @pytest.mark.parametrize("numba_compiled", [True, False])
125 | def test_cosine_similarity_all_zeros(numba_compiled):
126 |     """Test cosine similarity score calculation with an all-zero vector."""
127 |     vector1 = np.array([0, 0, 0, 0])
128 |     vector2 = np.array([1, 1, 1, 1])
129 | 
130 |     if numba_compiled:
131 |         score11 = cosine_similarity(vector1, vector1)
132 |         score12 = cosine_similarity(vector1, vector2)
133 |         score22 = cosine_similarity(vector2, vector2)
134 |     else:
135 |         score11 = cosine_similarity.py_func(vector1, vector1)
136 |         score12 = cosine_similarity.py_func(vector1, vector2)
137 |         score22 = cosine_similarity.py_func(vector2, vector2)
138 | 
139 |     assert score11 == score12 == 0.0, "Expected different score."
140 |     assert score22 == 1.0, "Expected different score."
141 | 
142 | 
143 | @pytest.mark.parametrize("numba_compiled", [True, False])
144 | def test_cosine_similarity_matrix(numba_compiled):
145 |     """Test cosine similarity scores calculation using int32 input."""
146 |     vectors1 = np.array([[1, 1, 0, 0],
147 |                          [1, 0, 1, 1]], dtype=np.int32)
148 |     vectors2 = np.array([[0, 1, 1, 0],
149 |                          [0, 0, 1, 1]], dtype=np.int32)
150 | 
151 |     if numba_compiled:
152 |         scores = cosine_similarity_matrix(vectors1, vectors2)
153 |     else:
154 |         scores = cosine_similarity_matrix.py_func(vectors1, vectors2)
155 |     expected_scores = np.array([[0.5, 0.],
156 |                                 [0.40824829, 0.81649658]])
157 |     assert scores == pytest.approx(expected_scores, 1e-7), "Expected different scores."
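    # Worked example for row 0: dot([1,1,0,0], [0,1,1,0]) / (sqrt(2) * sqrt(2)) = 0.5,
    # and dot([1,1,0,0], [0,0,1,1]) = 0, matching expected_scores[0].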
158 | 
159 | 
160 | @pytest.mark.parametrize("numba_compiled", [True, False])
161 | def test_cosine_similarity_floats_matrix(numba_compiled):
162 |     """Test cosine similarity scores calculation using float64 input."""
163 |     vectors1 = np.array([[1, 1, 0, 0],
164 |                          [1, 0, 1, 1]], dtype=np.float64)
165 |     vectors2 = np.array([[0, 1, 1, 0],
166 |                          [0, 0, 1, 1]], dtype=np.float64)
167 | 
168 |     if numba_compiled:
169 |         scores = cosine_similarity_matrix(vectors1, vectors2)
170 |     else:
171 |         scores = cosine_similarity_matrix.py_func(vectors1, vectors2)
172 |     expected_scores = np.array([[0.5, 0.],
173 |                                 [0.40824829, 0.81649658]])
174 |     assert scores == pytest.approx(expected_scores, 1e-7), "Expected different scores."
175 | 
176 | 
177 | @pytest.mark.parametrize("numba_compiled", [True, False])
178 | def test_cosine_similarity_matrix_input_cloned(numba_compiled):
179 |     """Test if the score implementation leaves the input arrays unchanged."""
180 |     vectors1 = np.array([[2, 2, 0, 0],
181 |                          [2, 0, 2, 2]])
182 |     vectors2 = np.array([[0, 2, 2, 0],
183 |                          [0, 0, 2, 2]])
184 | 
185 |     if numba_compiled:
186 |         cosine_similarity_matrix(vectors1, vectors2)
187 |     else:
188 |         cosine_similarity_matrix.py_func(vectors1, vectors2)
189 | 
190 |     assert np.all(vectors1 == np.array([[2, 2, 0, 0],
191 |                                         [2, 0, 2, 2]])), "Expected unchanged input."
192 | 
193 | 
194 | def test_different_input_vector_lengths():
195 |     """Test if the correct error is raised for input vectors of different lengths."""
196 |     vector1 = np.array([0, 0, 0, 0])
197 |     vector2 = np.array([1, 1, 1, 1, 1])
198 | 
199 |     with pytest.raises(AssertionError) as msg:
200 |         _ = cosine_similarity(vector1, vector2)
201 | 
202 |     expected_message = "Input vectors must have same shape."
203 |     assert expected_message == str(msg.value), "Expected particular error message."
204 | 
--------------------------------------------------------------------------------
/tests/test_version_string_consistency.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | from spec2vec import __version__ as expected_version
4 | 
5 | 
6 | def test_version_string_consistency():
7 |     """Check whether version in conda/meta.yaml is consistent with that in spec2vec.__version__"""
8 | 
9 |     repository_root = os.path.join(os.path.dirname(__file__), "..")
10 |     fixture = os.path.join(repository_root, "conda", "meta.yaml")
11 | 
12 |     with open(fixture, "r", encoding="utf-8") as f:
13 |         metayaml_contents = f.read()
14 | 
15 |     match = re.search(r"^{% set version = \"(?P<semver>.*)\" %}$", metayaml_contents, re.MULTILINE)
16 |     actual_version = match["semver"]
17 | 
18 |     assert expected_version == actual_version, "Expected version string used in conda/meta.yaml to be consistent with" \
19 |                                                " that in spec2vec.__version__"
20 | 
--------------------------------------------------------------------------------