├── .editorconfig
├── .github
    └── workflows
    │   ├── CI_build.yml
    │   └── pypi_publish.yml
├── .gitignore
├── .prospector.yml
├── .readthedocs.yml
├── .zenodo.json
├── CHANGELOG.md
├── CITATION.cff
├── CODE_OF_CONDUCT.rst
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.rst
├── conda
    ├── README.md
    ├── environment-build.yml
    ├── environment-dev.yml
    ├── environment.yml
    └── meta.yaml
├── integration-tests
    ├── test_user_workflow_spec2vec.model
    └── test_user_workflow_spec2vec.py
├── readthedocs
    ├── Makefile
    ├── conf.py
    ├── index.rst
    └── make.bat
├── setup.cfg
├── setup.py
├── sonar-project.properties
├── spec2vec
    ├── Document.py
    ├── Spec2Vec.py
    ├── SpectrumDocument.py
    ├── __init__.py
    ├── __version__.py
    ├── logging_functions.py
    ├── model_building.py
    ├── serialization
    │   ├── __init__.py
    │   ├── model_exporting.py
    │   └── model_importing.py
    ├── utils.py
    └── vector_operations.py
└── tests
    ├── __init__.py
    ├── conftest.py
    ├── data
        ├── model.json
        ├── pesticides.mgf
        └── weights.npy
    ├── test_document.py
    ├── test_logging.py
    ├── test_model_building.py
    ├── test_model_serialization.py
    ├── test_spec2vec.py
    ├── test_spectrum_document.py
    ├── test_vector_operations.py
    └── test_version_string_consistency.py


/.editorconfig:
--------------------------------------------------------------------------------
 1 | # EditorConfig is awesome: http://EditorConfig.org
 2 | 
 3 | # top-most EditorConfig file
 4 | root = true
 5 | 
 6 | # Unix-style newlines with a newline ending every file
 7 | [*]
 8 | end_of_line = lf
 9 | insert_final_newline = true
10 | trim_trailing_whitespace = true
11 | charset = utf-8
12 | 
13 | # 4 space indentation
14 | [*.{py,java,r,R}]
15 | indent_style = space
16 | indent_size = 4
17 | 
18 | # 2 space indentation
19 | [*.{js,json,y{a,}ml,html,cwl}]
20 | indent_style = space
21 | indent_size = 2
22 | 
23 | [*.{md,Rmd,rst}]
24 | trim_trailing_whitespace = false
25 | indent_style = space
26 | indent_size = 2
27 | 


--------------------------------------------------------------------------------
/.github/workflows/CI_build.yml:
--------------------------------------------------------------------------------
  1 | name: CI Build
  2 | 
  3 | on:
  4 |   push:
  5 |   pull_request:
  6 |     types: [opened, reopened]
  7 | 
  8 | jobs:
  9 | 
 10 |   first_check:
 11 |     name: first code check / python-3.8 / ubuntu-latest
 12 |     runs-on: ubuntu-latest
 13 |     steps:
 14 |       - uses: actions/checkout@v2
 15 |       - name: Set up Python
 16 |         uses: actions/setup-python@v1
 17 |         with:
 18 |           python-version: 3.8
 19 |       - name: Python info
 20 |         run: |
 21 |           which python
 22 |           python --version
 23 |       - name: Build package and create dev environment
 24 |         run: |
 25 |           python -m pip install --upgrade pip
 26 |           pip install -e .[dev]
 27 |       - name: Show pip list
 28 |         run: |
 29 |           pip list
 30 |       - name: Test with coverage
 31 |         run: |
 32 |           pytest --cov --cov-report term --cov-report xml --junitxml=xunit-result.xml
 33 |       - name: Correct coverage paths
 34 |         run: sed -i "s+$PWD/++g" coverage.xml
 35 |       - name: Check style against standards using prospector
 36 |         shell: bash -l {0}
 37 |         run: prospector -o grouped -o pylint:pylint-report.txt
 38 |       - name: Check whether import statements are used consistently
 39 |         shell: bash -l {0}
 40 |         run: isort --check-only --diff --conda-env spec2vec-dev .
 41 |       - name: SonarCloud Scan
 42 |         if: github.repository == 'iomega/spec2vec'
 43 |         uses: sonarsource/sonarcloud-github-action@master
 44 |         env:
 45 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 46 |           SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
 47 | 
 48 |   build_pypi:
 49 |     name: Pypi and documentation build / python-${{ matrix.python-version }} / ${{ matrix.os }}
 50 |     runs-on: ${{ matrix.os }}
 51 |     needs: first_check
 52 |     strategy:
 53 |       fail-fast: false
 54 |       matrix:
 55 |         os: ['ubuntu-latest', 'macos-latest', 'windows-latest']
 56 |         python-version: ['3.7', '3.8', '3.9']
 57 |         exclude:
 58 |           # already tested in first_check job
 59 |           - python-version: 3.8
 60 |             os: ubuntu-latest
 61 |     steps:
 62 |       - uses: actions/checkout@v2
 63 |       - name: Set up Python ${{ matrix.python-version }}
 64 |         uses: actions/setup-python@v4
 65 |         with:
 66 |           python-version: ${{ matrix.python-version }}
 67 |       - name: Python info
 68 |         run: |
 69 |           which python
 70 |           python --version
 71 |       - name: Install dependencies
 72 |         run: |
 73 |           python -m pip install --upgrade pip
 74 |       - name: Build package
 75 |         run: |
 76 |           pip install wheel twine
 77 |           python setup.py sdist bdist_wheel
 78 |       - name: Test package
 79 |         run: |
 80 |           python -m twine check dist/*
 81 |       - name: Show pip list
 82 |         run: |
 83 |           pip list
 84 |       - name: Install development dependencies
 85 |         run: |
 86 |           pip install -e .[dev]
 87 |       - name: Test
 88 |         run: |
 89 |           pytest
 90 |       - name: Show environment variables
 91 |         shell: bash -l {0}
 92 |         run: |
 93 |           env | sort
 94 |       - name: Build documentation
 95 |         shell: bash -l {0}
 96 |         run: |
 97 |           make coverage doctest html
 98 |         working-directory: readthedocs/
 99 |         env:
100 |           SPHINXOPTS: "-n"  # enable nit-picky mode
101 |       - name: Check documentation coverage threshold
102 |         if: matrix.os == 'ubuntu-latest'
103 |         run: |
104 |           cat readthedocs/_build/coverage/python.txt
105 |           UNCOVERED_MEMBERS=$(grep '*' readthedocs/_build/coverage/python.txt | wc -l)
106 |           UNCOVERED_MEMBERS_ALLOWED=5
107 |           if (( $UNCOVERED_MEMBERS > $UNCOVERED_MEMBERS_ALLOWED )) ; then echo "There are currently ${UNCOVERED_MEMBERS} uncovered members in the documentation, which is more than the ${UNCOVERED_MEMBERS_ALLOWED} allowed." && exit 1;fi
108 |           echo "The code is sufficiently documented with ${UNCOVERED_MEMBERS} uncovered members out of ${UNCOVERED_MEMBERS_ALLOWED} allowed.";
109 | 
110 |   anaconda_build:
111 |     name: Anaconda build / python-3.7 / ubuntu-latest
112 |     runs-on: ubuntu-latest
113 |     strategy:
114 |       fail-fast: false
115 |     needs: first_check
116 |     steps:
117 |       - uses: actions/checkout@v2
118 |         with:
119 |           fetch-depth: "0"
120 |       - name: Create spec2vec-build environment
121 |         uses: conda-incubator/setup-miniconda@v2
122 |         with:
123 |           activate-environment: spec2vec-build
124 |           auto-update-conda: true
125 |           environment-file: conda/environment-build.yml
126 |           python-version: 3.8
127 |       - name: Show conda config
128 |         shell: bash -l {0}
129 |         run: |
130 |           conda info
131 |           conda list
132 |           conda config --show-sources
133 |           conda config --show
134 |           conda env list
135 |       - name: Python info
136 |         shell: bash -l {0}
137 |         run: |
138 |           which python
139 |           python --version
140 |       - name: Show environment variables
141 |         shell: bash -l {0}
142 |         run: |
143 |           env | sort
144 |       - name: Build the conda package
145 |         shell: bash -l {0}
146 |         run: |
147 |           export BUILDDIR=$RUNNER_TEMP/spec2vec/_build
148 |           [ "$RUNNING_OS" = "Windows" ] && export BUILDDIR=$RUNNER_TEMP\\spec2vec\\_build\\
149 |           conda config --set anaconda_upload no
150 |           conda build --no-include-recipe \
151 |             --channel bioconda --channel conda-forge \
152 |             --croot ${BUILDDIR} \
153 |             ./conda
154 |       - name: Upload package artifact from build
155 |         uses: actions/upload-artifact@v2
156 |         with:
157 |           name: conda-package-artifact
158 |           path: ${{ runner.temp }}/spec2vec/_build
159 | 


--------------------------------------------------------------------------------
/.github/workflows/pypi_publish.yml:
--------------------------------------------------------------------------------
 1 | name: Publish
 2 | 
 3 | on:
 4 |   release:
 5 |     types: [published]
 6 | 
 7 | jobs:
 8 |   publish:
 9 |     runs-on: ubuntu-latest
10 |     steps:
11 |     - uses: actions/checkout@v2
12 |     - name: Set up Python
13 |       uses: actions/setup-python@v1
14 |       with:
15 |         python-version: 3.7
16 |     - name: Install dependencies
17 |       run: |
18 |         python -m pip install --upgrade pip
19 |         pip install setuptools wheel twine
20 |         python setup.py sdist bdist_wheel
21 |     - name: Publish package
22 |       uses: pypa/gh-action-pypi-publish@release/v1
23 |       with:
24 |         user: __token__
25 |         password: ${{ secrets.PYPI_TOKEN }}
26 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.py[cod]
 2 | *.egg-info
 3 | *.eggs
 4 | .ipynb_checkpoints
 5 | 
 6 | build
 7 | dist
 8 | .cache
 9 | __pycache__
10 | 
11 | htmlcov
12 | .coverage
13 | coverage.xml
14 | .pytest_cache
15 | pylint-report.txt
16 | xunit-result.xml
17 | .scannerwork/
18 | 
19 | docs/_build
20 | docs/apidocs
21 | 
22 | # ide
23 | .idea
24 | .eclipse
25 | .vscode
26 | 
27 | # Mac
28 | .DS_Store
29 | config.py
30 | output/
31 | /data/
32 | models_trained/
33 | computed_results/
34 | notebooks/.ipynb_checkpoints/
35 | __pycache__/
36 | 
37 | 
38 | # conda build directory
39 | /_build
40 | 


--------------------------------------------------------------------------------
/.prospector.yml:
--------------------------------------------------------------------------------
 1 | # prospector configuration file
 2 | 
 3 | ---
 4 | 
 5 | output-format: grouped
 6 | 
 7 | strictness: medium
 8 | doc-warnings: false
 9 | test-warnings: true
10 | member-warnings: false
11 | 
12 | ignore-paths:
13 |   - readthedocs
14 | 


--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
 1 | # .readthedocs.yml
 2 | # Read the Docs configuration file
 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 4 | 
 5 | # Required
 6 | version: 2
 7 | 
 8 | # Build documentation in the docs/ directory with Sphinx
 9 | sphinx:
10 |   builder: html
11 |   configuration: readthedocs/conf.py
12 | 
13 | python:
14 |   version: 3.7
15 |   install:
16 |     - method: pip
17 |       path: .
18 | conda:
19 |   environment: conda/environment-dev.yml
20 | 


--------------------------------------------------------------------------------
/.zenodo.json:
--------------------------------------------------------------------------------
 1 | {
 2 |    "creators": [
 3 |       {
 4 |          "affiliation": "Netherlands eScience Center", 
 5 |          "name": "Huber, Florian", 
 6 |          "orcid": "0000-0002-3535-9406"
 7 |       }, 
 8 |       {
 9 |          "affiliation": "Wageningen University and Research", 
10 |          "name": "van der Hooft, Justin J. J.", 
11 |          "orcid": "0000-0002-9340-5511"
12 |       }, 
13 |       {
14 |          "affiliation": "Netherlands eScience Center", 
15 |          "name": "Spaaks, Jurriaan H.", 
16 |          "orcid": "0000-0002-7064-4069"
17 |       }, 
18 |       {
19 |          "affiliation": "Netherlands eScience Center", 
20 |          "name": "Diblen, Faruk", 
21 |          "orcid": "0000-0002-0989-929X"
22 |       }, 
23 |       {
24 |          "affiliation": "Netherlands eScience Center", 
25 |          "name": "Verhoeven, Stefan", 
26 |          "orcid": "0000-0002-5821-2060"
27 |       }, 
28 |       {
29 |          "affiliation": "Netherlands eScience Center", 
30 |          "name": "de Jonge, Niek", 
31 |          "orcid": "0000-0002-3054-6210"
32 |       }, 
33 |       {
34 |          "affiliation": "Netherlands eScience Center", 
35 |          "name": "Geng, Cunliang", 
36 |          "orcid": "0000-0002-1409-8358"
37 |       }, 
38 |       {
39 |          "affiliation": "Netherlands eScience Center", 
40 |          "name": "Meijer, Christiaan", 
41 |          "orcid": "0000-0002-5529-5761"
42 |       }, 
43 |       {
44 |          "affiliation": "University of Glasgow", 
45 |          "name": "Rogers, Simon", 
46 |          "orcid": "0000-0003-3578-4477"
47 |       }, 
48 |       {
49 |          "affiliation": "Netherlands eScience Center", 
50 |          "name": "Belloum, Adam", 
51 |          "orcid": "0000-0001-6306-6937"
52 |       }, 
53 |       {
54 |          "affiliation": "Netherlands eScience Center", 
55 |          "name": "Spreeuw, Hanno", 
56 |          "orcid": "0000-0002-5057-0322"
57 |       }
58 |       {
59 |          "affiliation": "ICS, Masaryk University",
60 |          "name": "Skoryk, Maksym",
61 |          "orcid": "0000-0003-2056-8018"
62 |       }
63 |    ], 
64 |    "description": "Word2Vec based similarity measure of mass spectrometry data.", 
65 |    "keywords": [
66 |       "Word2Vec", 
67 |       "similarity measures", 
68 |       "mass spectrometry"
69 |    ], 
70 |    "license": {
71 |       "id": "Apache-2.0"
72 |    }, 
73 |    "title": "spec2vec"
74 | }
75 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | # Changelog
  2 | 
  3 | All notable changes to this project will be documented in this file.
  4 | 
  5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
  6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
  7 | 
  8 | ## [Unreleased]
  9 | 
 10 | ## [0.8.1] - 2024-08-06
 11 | ### Changed
 12 | - Set max matchms to 0.26.4
 13 | - Set max scipy to 1.10.1
 14 | 
 15 | ## [0.8.0] - 2022-01-06
 16 | 
 17 | ### Changed
 18 | 
 19 | - Minor changes to make tests pass with new matchms versions (>=0.18.0). Should nearly always be backwards compatible though.
 20 | - Now dependency requirement is set to `matchms>=0.14.0`
 21 | 
 22 | ## [0.7.0] - 2022-10-01
 23 | 
 24 | ### Added
 25 | 
 26 | - added `spec2vec.serialization` subpackage to import and export `Word2Vec` models to/from disk without Pickle 
 27 |   (via `import_model` and `export_model` respectively) [#80](https://github.com/iomega/spec2vec/pull/80)
 28 | 
 29 | ### Changed
 30 | 
 31 | - bumped **gensim** version to `>=4.2.0` in dependencies [#84](https://github.com/iomega/spec2vec/pull/84)
 32 | 
 33 | ### Fixed
 34 | 
 35 | - updated Code examples in documentation to recent changes in matchms.
 36 | 
 37 | ## [0.6.0] - 2022-01-03
 38 | 
 39 | ### Added
 40 | 
 41 | - Logging (replacing former print statements) including options to write logs to file [#73](https://github.com/iomega/spec2vec/pull/73)
 42 | - Now supports Python 3.9 (including CI test runs) [#40](https://github.com/iomega/spec2vec/issues/40)
 43 | 
 44 | ### Changed
 45 | 
 46 | - missing words percentage above the `allowed_missing_percentage` no longer causes an expection but only leads to raising a warning [#73](https://github.com/iomega/spec2vec/pull/73)
 47 | - default setting for `allowed_missing_percentage` to 10.0 to be less strict on model coverage [#72](https://github.com/iomega/spec2vec/pull/72)
 48 | 
 49 | ### Fixed
 50 | 
 51 | - Can now also handle spectra in which no peak is known to the model (will return warning + empty vector) [#73](https://github.com/iomega/spec2vec/pull/73)
 52 | 
 53 | ## [0.5.0] - 2021-06-18
 54 | 
 55 | ### Changed
 56 | 
 57 | - Spec2Vec is now using gensim >= 4.0.0 [#62](https://github.com/iomega/spec2vec/pull/62)
 58 | 
 59 | ## [0.4.0] - 2021-02-10
 60 | 
 61 | ### Changed
 62 | 
 63 | - refactored `Spec2Vec` to now accept `Spectrum` or `SpectrumDocument` as input [#51](https://github.com/iomega/spec2vec/issues/51)
 64 | 
 65 | ### Fixed
 66 | 
 67 | - updated and fixed code examples  [#51](https://github.com/iomega/spec2vec/issues/51)
 68 | - updated and fixed attribute typing [#51](https://github.com/iomega/spec2vec/issues/51)
 69 | 
 70 | ## [0.3.4] - 2021-02-10
 71 | 
 72 | ### Changed
 73 | 
 74 | - update required numba version to >=0.51 to avoid issues between numba and numpy [#55](https://github.com/iomega/spec2vec/pull/55)
 75 | 
 76 | ## [0.3.3] - 2021-02-09
 77 | 
 78 | ### Added
 79 | 
 80 | - Metadata getter method for `SpectrumDocument` [#50](https://github.com/iomega/spec2vec/pull/50)
 81 | - Implement `is_symmetric=True` option for `Spec2Vec.matrix` method [#53](https://github.com/iomega/spec2vec/pull/53)
 82 | 
 83 | ### Changed
 84 | 
 85 | - Change default for `n_decimals` parameter from 1 to 2 [#50](https://github.com/iomega/spec2vec/pull/50)
 86 | 
 87 | ## [0.3.2] - 2020-12-03
 88 | 
 89 | ### Changed
 90 | 
 91 | - Add optional progress bar for spec2vec.matrix() calculations (default is False) [#43](https://github.com/iomega/spec2vec/pull/43)
 92 | 
 93 | ## [0.3.1] - 2020-09-23
 94 | 
 95 | ### Changed
 96 | 
 97 | - Implement faster, numba-based cosine similarity function [#29](https://github.com/iomega/spec2vec/pull/29)
 98 | 
 99 | ## [0.3.0] - 2020-09-16
100 | 
101 | ### Added
102 | 
103 | - Support for Python 3.8 [#35](https://github.com/iomega/spec2vec/pull/35)
104 | 
105 | ### Changed
106 | 
107 | - Refactored Spec2Vec class to provide .pair() and .matrix() methods [#35](https://github.com/iomega/spec2vec/pull/35)
108 | 
109 | ### Removed
110 | 
111 | - Spec2VecParallel (is now included as Spec2Vec.matrix()) [#35](https://github.com/iomega/spec2vec/pull/35)
112 | 
113 | ## [0.2.0] - 2020-06-18
114 | 
115 | ### Added
116 | 
117 | - Wrapper for training a gensim word2vec model [#13](https://github.com/iomega/spec2vec/tree/13-gensim-wrapper)
118 | - Basic logger for word2vec model training [#11](https://github.com/iomega/spec2vec/issues/11)
119 | 
120 | ### Changed
121 | 
122 | - Extend spec2vec similarity calculation to handle missing words [#9](https://github.com/iomega/spec2vec/issues/9)
123 | - Extend documentation and given code examples [#15](https://github.com/iomega/spec2vec/issues/15)
124 | - Updated the integration test to work with matchms 0.4.0 [#7](https://github.com/iomega/spec2vec/issues/7)
125 | 
126 | ## [0.1.0] - 2020-06-02
127 | 
128 | ### Added
129 | 
130 | - Matchms as dependency [#4](https://github.com/iomega/spec2vec/pull/4)
131 | - Bump2version config
132 | 
133 | ### Changed
134 | 
135 | - Splitted spec2vec from [matchms]. See (https://github.com/matchms/matchms) [#1](https://github.com/iomega/spec2vec/pull/1) [#4](https://github.com/iomega/spec2vec/pull/4)
136 |   - Updated packaging related configuration
137 |   - Update the GH Actions workflows
138 |   - Updated the documentation
139 |   - Updated the badges
140 |   - Updated the integration and unit tests
141 |   - Zenodo metadata
142 |   
143 | ### Fixed
144 | 
145 | ### Removed
146 | 
147 | - Fossa configuration
148 | - Flowchart
149 | 
150 | [Unreleased]: https://github.com/iomega/spec2vec/compare/0.8.0...HEAD
151 | [0.8.0]: https://github.com/iomega/spec2vec/compare/0.6.0...0.7.0
152 | [0.7.0]: https://github.com/iomega/spec2vec/compare/0.6.0...0.7.0
153 | [0.6.0]: https://github.com/iomega/spec2vec/compare/0.5.0...0.6.0
154 | [0.5.0]: https://github.com/iomega/spec2vec/compare/0.4.0...0.5.0
155 | [0.4.0]: https://github.com/iomega/spec2vec/compare/0.3.4...0.4.0
156 | [0.3.4]: https://github.com/iomega/spec2vec/compare/0.3.3...0.3.4
157 | [0.3.3]: https://github.com/iomega/spec2vec/compare/0.3.2...0.3.3
158 | [0.3.2]: https://github.com/iomega/spec2vec/compare/0.3.1...0.3.2
159 | [0.3.1]: https://github.com/iomega/spec2vec/compare/0.3.0...0.3.1
160 | [0.3.0]: https://github.com/iomega/spec2vec/compare/0.2.0...0.3.0
161 | [0.2.0]: https://github.com/iomega/spec2vec/compare/0.1.0...0.2.0
162 | [0.1.0]: https://github.com/iomega/spec2vec/releases/tag/0.1.0
163 | 


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | # YAML 1.2
 2 | ---
 3 | abstract: "Word2Vec based similarity measure of mass spectrometry data."
 4 | authors:
 5 |   -
 6 |     affiliation: "Netherlands eScience Center"
 7 |     family-names: Huber
 8 |     given-names: Florian
 9 |     orcid: "https://orcid.org/0000-0002-3535-9406"
10 |   -
11 |     affiliation: "Wageningen University and Research"
12 |     family-names: Hooft
13 |     name-particle: van der
14 |     given-names: Justin J. J.
15 |     orcid: "https://orcid.org/0000-0002-9340-5511"
16 |   -
17 |     affiliation: "Netherlands eScience Center"
18 |     family-names: Spaaks
19 |     given-names: Jurriaan H.
20 |     orcid: "https://orcid.org/0000-0002-7064-4069"
21 |   -
22 |     affiliation: "Netherlands eScience Center"
23 |     family-names: Diblen
24 |     given-names: Faruk
25 |     orcid: "https://orcid.org/0000-0002-0989-929X"
26 |   -
27 |     affiliation: "Netherlands eScience Center"
28 |     family-names: Verhoeven
29 |     given-names: Stefan
30 |     orcid: "https://orcid.org/0000-0002-5821-2060"
31 |   -
32 |     affiliation: "Netherlands eScience Center"
33 |     family-names: Geng
34 |     given-names: Cunliang
35 |     orcid: "https://orcid.org/0000-0002-1409-8358"
36 |   -
37 |     affiliation: "Netherlands eScience Center"
38 |     family-names: Meijer
39 |     given-names: Christiaan
40 |     orcid: "https://orcid.org/0000-0002-5529-5761"
41 |   -
42 |     affiliation: "University of Glasgow"
43 |     family-names: Rogers
44 |     given-names: Simon
45 |     orcid: "https://orcid.org/0000-0003-3578-4477"
46 |   -
47 |     affiliation: "Netherlands eScience Center"
48 |     family-names: Belloum
49 |     given-names: Adam
50 |     orcid: "https://orcid.org/0000-0001-6306-6937"
51 |   -
52 |     affiliation: "Netherlands eScience Center"
53 |     family-names: Spreeuw
54 |     given-names: Hanno
55 |     orcid: "https://orcid.org/0000-0002-5057-0322"
56 |   -
57 |     affiliation: "Netherlands eScience Center"
58 |     family-names: de Jonge
59 |     given-names: Niek
60 |     orcid: "https://orcid.org/0000-0002-3054-6210"
61 |   -
62 |     affiliation: "ICS, Masaryk University"
63 |     family-names: Skoryk
64 |     given-names: Maksym
65 |     orcid: "https://orcid.org/0000-0003-2056-8018"
66 | 
67 | cff-version: "1.1.0"
68 | keywords:
69 |   - Word2Vec
70 |   - "similarity measures"
71 |   - "mass spectrometry"
72 | license: "Apache-2.0"
73 | message: "If you use this software, please cite it using these metadata."
74 | repository-code: "https://github.com/iomega/spec2vec"
75 | title: spec2vec
76 | ...
77 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.rst:
--------------------------------------------------------------------------------
 1 | ###############################################################################
 2 | Contributor Covenant Code of Conduct
 3 | ###############################################################################
 4 | 
 5 | Our Pledge
 6 | **********
 7 | 
 8 | In the interest of fostering an open and welcoming environment, we as
 9 | contributors and maintainers pledge to making participation in our project and
10 | our community a harassment-free experience for everyone, regardless of age, body
11 | size, disability, ethnicity, gender identity and expression, level of experience,
12 | education, socio-economic status, nationality, personal appearance, race,
13 | religion, or sexual identity and orientation.
14 | 
15 | Our Standards
16 | *************
17 | 
18 | Examples of behavior that contributes to creating a positive environment
19 | include:
20 | 
21 | * Using welcoming and inclusive language
22 | * Being respectful of differing viewpoints and experiences
23 | * Gracefully accepting constructive criticism
24 | * Focusing on what is best for the community
25 | * Showing empathy towards other community members
26 | 
27 | Examples of unacceptable behavior by participants include:
28 | 
29 | * The use of sexualized language or imagery and unwelcome sexual attention or
30 |   advances
31 | * Trolling, insulting/derogatory comments, and personal or political attacks
32 | * Public or private harassment
33 | * Publishing others' private information, such as a physical or electronic
34 |   address, without explicit permission
35 | * Other conduct which could reasonably be considered inappropriate in a
36 |   professional setting
37 | 
38 | Our Responsibilities
39 | ********************
40 | 
41 | Project maintainers are responsible for clarifying the standards of acceptable
42 | behavior and are expected to take appropriate and fair corrective action in
43 | response to any instances of unacceptable behavior.
44 | 
45 | Project maintainers have the right and responsibility to remove, edit, or
46 | reject comments, commits, code, wiki edits, issues, and other contributions
47 | that are not aligned to this Code of Conduct, or to ban temporarily or
48 | permanently any contributor for other behaviors that they deem inappropriate,
49 | threatening, offensive, or harmful.
50 | 
51 | Scope
52 | *****
53 | 
54 | This Code of Conduct applies both within project spaces and in public spaces
55 | when an individual is representing the project or its community. Examples of
56 | representing a project or community include using an official project e-mail
57 | address, posting via an official social media account, or acting as an appointed
58 | representative at an online or offline event. Representation of a project may be
59 | further defined and clarified by project maintainers.
60 | 
61 | Enforcement
62 | ***********
63 | 
64 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
65 | reported by contacting the project team at generalization@esciencecenter.nl. All
66 | complaints will be reviewed and investigated and will result in a response that
67 | is deemed necessary and appropriate to the circumstances. The project team is
68 | obligated to maintain confidentiality with regard to the reporter of an incident.
69 | Further details of specific enforcement policies may be posted separately.
70 | 
71 | Project maintainers who do not follow or enforce the Code of Conduct in good
72 | faith may face temporary or permanent repercussions as determined by other
73 | members of the project's leadership.
74 | 
75 | Attribution
76 | ***********
77 | 
78 | This Code of Conduct is adapted from the `Contributor Covenant <https://www.contributor-covenant.org>`_, version 1.4,
79 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
80 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing guidelines
 2 | 
 3 | We welcome any kind of contribution to our software, from simple comment or question to a full fledged [pull request](https://help.github.com/articles/about-pull-requests/). Please read and follow our [Code of Conduct](CODE_OF_CONDUCT.rst).
 4 | 
 5 | A contribution can be one of the following cases:
 6 | 
 7 | 1. you have a question;
 8 | 1. you think you may have found a bug (including unexpected behavior);
 9 | 1. you want to make some kind of change to the code base (e.g. to fix a bug, to add a new feature, to update documentation);
10 | 1. you want to make a new release of the code base.
11 | 
12 | The sections below outline the steps in each case.
13 | 
14 | ## You have a question
15 | 
16 | 1. use the search functionality [here](https://github.com/iomega/spec2vec/issues) to see if someone already filed the same issue;
17 | 1. if your issue search did not yield any relevant results, make a new issue;
18 | 1. apply the "Question" label; apply other labels when relevant.
19 | 
20 | ## You think you may have found a bug
21 | 
22 | 1. use the search functionality [here](https://github.com/iomega/spec2vec/issues) to see if someone already filed the same issue;
23 | 1. if your issue search did not yield any relevant results, make a new issue, making sure to provide enough information to the rest of the community to understand the cause and context of the problem. Depending on the issue, you may want to include:
24 |     - the [SHA hashcode](https://help.github.com/articles/autolinked-references-and-urls/#commit-shas) of the commit that is causing your problem;
25 |     - some identifying information (name and version number) for dependencies you're using;
26 |     - information about the operating system;
27 | 1. apply relevant labels to the newly created issue.
28 | 
29 | ## You want to make some kind of change to the code base
30 | 
31 | 1. (**important**) announce your plan to the rest of the community *before you start working*. This announcement should be in the form of a (new) issue;
32 | 1. (**important**) wait until some kind of consensus is reached about your idea being a good idea;
33 | 1. if needed, fork the repository to your own Github profile and create your own feature branch off of the latest master commit. While working on your feature branch, make sure to stay up to date with the master branch by pulling in changes, possibly from the 'upstream' repository (follow the instructions [here](https://help.github.com/articles/configuring-a-remote-for-a-fork/) and [here](https://help.github.com/articles/syncing-a-fork/));
34 | 1. make sure the existing tests still work by running ``python setup.py test``;
35 | 1. add your own tests (if necessary);
36 | 1. update or expand the documentation;
37 | 1. update the `CHANGELOG.md` file with change;
38 | 1. [push](http://rogerdudler.github.io/git-guide/>) your feature branch to (your fork of) the spec2vec repository on GitHub;
39 | 1. create the pull request, e.g. following the instructions [here](https://help.github.com/articles/creating-a-pull-request/).
40 | 
41 | In case you feel like you've made a valuable contribution, but you don't know how to write or run tests for it, or how to generate the documentation: don't let this discourage you from making the pull request; we can help you! Just go ahead and submit the pull request, but keep in mind that you might be asked to append additional commits to your pull request.
42 | 
43 | ## You want to make a new release of the code base
44 | 
45 | To create release you need write permission on the repository.
46 | 
47 | 1. Check author list in `citation.cff` and `.zenodo.json` files
48 | 1. Bump the version using `bump2version <major|minor|patch>`. For example, `bump2version major` will increase major version numbers everywhere its needed (code, meta, etc.) in the repo.
49 | 1. Update the `CHANGELOG.md` to include changes made
50 | 1. Goto [GitHub release page](https://github.com/iomega/spec2vec/releases)
51 | 1. Press draft a new release button
52 | 1. Fill version, title and description field
53 | 1. Press the Publish Release button
54 | 
55 | A GitHub action will run which will publish the new version to [anaconda](https://anaconda.org/nlesc/spec2vec).
56 | Also a Zenodo entry will be made for the release with its own DOI.
57 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "{}"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright [yyy] [name of copyright owner]
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 | 
204 | 
205 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include README.rst
3 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | `fair-software.nl <https://fair-software.nl/>`_ recommendations:
  2 | 
  3 | |GitHub Badge|
  4 | |License Badge|
  5 | |Conda Badge| |Pypi Badge| |Research Software Directory Badge|
  6 | |Zenodo Badge|
  7 | |CII Best Practices Badge| |Howfairis Badge|
  8 | 
  9 | Code quality checks:
 10 | 
 11 | |GitHub Workflow Status|
 12 | |ReadTheDocs Badge|
 13 | |Sonarcloud Quality Gate Badge| |Sonarcloud Coverage Badge|
 14 | 
 15 | ################################################################################
 16 | spec2vec
 17 | ################################################################################
 18 | **Spec2vec** is a novel spectral similarity score inspired by a natural language processing
 19 | algorithm -- Word2Vec. Where Word2Vec learns relationships between words in sentences,
 20 | **spec2vec** does so for mass fragments and neutral losses in MS/MS spectra.
 21 | The spectral similarity score is based on spectral embeddings learnt
 22 | from the fragmental relationships within a large set of spectral data. 
 23 | 
 24 | If you use **spec2vec** for your research, please cite the following references:
 25 | 
 26 | Huber F, Ridder L, Verhoeven S, Spaaks JH, Diblen F, Rogers S, van der Hooft JJJ, (2021) "Spec2Vec: Improved mass spectral similarity scoring through learning of structural relationships". PLoS Comput Biol 17(2): e1008724. `doi:10.1371/journal.pcbi.1008724 <https://doi.org/10.1371/journal.pcbi.1008724>`_
 27 | 
 28 | (and if you use **matchms** as well:
 29 | F. Huber, S. Verhoeven, C. Meijer, H. Spreeuw, E. M. Villanueva Castilla, C. Geng, J.J.J. van der Hooft, S. Rogers, A. Belloum, F. Diblen, J.H. Spaaks, (2020). "matchms - processing and similarity evaluation of mass spectrometry data". Journal of Open Source Software, 5(52), 2411, https://doi.org/10.21105/joss.02411 )
 30 | 
 31 | Thanks!
 32 | 
 33 | 
 34 | 
 35 | .. |GitHub Badge| image:: https://img.shields.io/badge/github-repo-000.svg?logo=github&labelColor=gray&color=blue
 36 |    :target: https://github.com/iomega/spec2vec
 37 |    :alt: GitHub Badge
 38 | 
 39 | .. |License Badge| image:: https://img.shields.io/github/license/iomega/spec2vec
 40 |    :target: https://github.com/iomega/spec2vec
 41 |    :alt: License Badge
 42 | 
 43 | .. |Conda Badge| image:: https://img.shields.io/conda/v/bioconda/spec2vec?color=blue
 44 |    :target: https://bioconda.github.io/recipes/spec2vec/README.html
 45 |    :alt: Conda Badge (Bioconda)
 46 | 
 47 | .. |Pypi Badge| image:: https://img.shields.io/pypi/v/spec2vec?color=blue
 48 |    :target: https://pypi.org/project/spec2vec/
 49 |    :alt: spec2vec on PyPI
 50 | 
 51 | .. |Research Software Directory Badge| image:: https://img.shields.io/badge/rsd-spec2vec-00a3e3.svg
 52 |    :target: https://www.research-software.nl/software/spec2vec
 53 |    :alt: Research Software Directory Badge
 54 | 
 55 | .. |Zenodo Badge| image:: https://zenodo.org/badge/DOI/10.5281/zenodo.3873169.svg
 56 |    :target: https://doi.org/10.5281/zenodo.3873169
 57 |    :alt: Zenodo Badge
 58 | 
 59 | .. |CII Best Practices Badge| image:: https://bestpractices.coreinfrastructure.org/projects/3967/badge
 60 |    :target: https://bestpractices.coreinfrastructure.org/projects/3967
 61 |    :alt: CII Best Practices Badge
 62 |    
 63 | .. |Howfairis Badge| image:: https://img.shields.io/badge/fair--software.eu-%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F-green
 64 |    :target: https://fair-software.eu
 65 |    :alt: Howfairis Badge
 66 | 
 67 | .. |ReadTheDocs Badge| image:: https://readthedocs.org/projects/spec2vec/badge/?version=latest
 68 |     :alt: Documentation Status
 69 |     :scale: 100%
 70 |     :target: https://spec2vec.readthedocs.io/en/latest/?badge=latest
 71 | 
 72 | .. |Sonarcloud Quality Gate Badge| image:: https://sonarcloud.io/api/project_badges/measure?project=iomega_spec2vec&metric=alert_status
 73 |    :target: https://sonarcloud.io/dashboard?id=iomega_spec2vec
 74 |    :alt: Sonarcloud Quality Gate
 75 | 
 76 | .. |Sonarcloud Coverage Badge| image:: https://sonarcloud.io/api/project_badges/measure?project=iomega_spec2vec&metric=coverage
 77 |    :target: https://sonarcloud.io/component_measures?id=iomega_spec2vec&metric=Coverage&view=list
 78 |    :alt: Sonarcloud Coverage
 79 | 
 80 | .. |GitHub Workflow Status| image:: https://img.shields.io/github/actions/workflow/status/matchms/spec2vec/CI_build.yml?branch=master
 81 |    :target: https://img.shields.io/github/workflow/status/iomega/spec2vec/CI%20Build
 82 |    :alt: GitHub Workflow Status
 83 | 
 84 | 
 85 | ***********************
 86 | Documentation for users
 87 | ***********************
 88 | For more extensive documentation `see our readthedocs <https://spec2vec.readthedocs.io/en/latest/>`_ or get started with our `spec2vec introduction tutorial <https://blog.esciencecenter.nl/build-a-mass-spectrometry-analysis-pipeline-in-python-using-matchms-part-ii-spec2vec-8aa639571018>`_.
 89 | 
 90 | Versions
 91 | ========
 92 | Since version `0.5.0` Spec2Vec uses `gensim >= 4.0.0` which should make it faster and more future proof. Model trained with older versions should still be importable without any issues. If you had scripts that used additional gensim code, however, those might occationally need some adaptation, see also the `gensim documentation on how to migrate your code <https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4>`_.
 93 | 
 94 | 
 95 | Installation
 96 | ============
 97 | 
 98 | 
 99 | Prerequisites:  
100 | 
101 | - Python 3.7, 3.8, or 3.9  
102 | - Recommended: Anaconda
103 | 
104 | We recommend installing spec2vec from Anaconda Cloud with
105 | 
106 | .. code-block:: console
107 | 
108 |   conda create --name spec2vec python=3.8
109 |   conda activate spec2vec
110 |   conda install --channel bioconda --channel conda-forge spec2vec
111 | 
112 | Alternatively, spec2vec can also be installed using ``pip``. When using spec2vec together with ``matchms`` it is important to note that only the Anaconda install will make sure that also ``rdkit`` is installed properly, which is requried for a few matchms filter functions (it is not required for any spec2vec related functionalities though).
113 | 
114 | .. code-block:: console
115 | 
116 |   pip install spec2vec
117 | 
118 | Examples
119 | ========
120 | Below a code example of how to process a large data set of reference spectra to
121 | train a word2vec model from scratch. Spectra are converted to documents using ``SpectrumDocument`` which converts spectrum peaks into "words" according to their m/z ratio (for instance "peak@100.39"). A new word2vec model can then trained using ``train_new_word2vec_model`` which will set the training parameters to spec2vec defaults unless specified otherwise. Word2Vec models learn from co-occurences of peaks ("words") across many different spectra.
122 | To get a model that can give a meaningful representation of a set of
123 | given spectra it is desirable to train the model on a large and representative
124 | dataset.
125 | 
126 | .. code-block:: python
127 | 
128 |     import os
129 |     import matchms.filtering as msfilters
130 |     from matchms.importing import load_from_mgf
131 |     from spec2vec import SpectrumDocument
132 |     from spec2vec.model_building import train_new_word2vec_model
133 | 
134 |     def spectrum_processing(s):
135 |         """This is how one would typically design a desired pre- and post-
136 |         processing pipeline."""
137 |         s = msfilters.default_filters(s)
138 |         s = msfilters.add_parent_mass(s)
139 |         s = msfilters.normalize_intensities(s)
140 |         s = msfilters.reduce_to_number_of_peaks(s, n_required=10, ratio_desired=0.5, n_max=500)
141 |         s = msfilters.select_by_mz(s, mz_from=0, mz_to=1000)
142 |         s = msfilters.add_losses(s, loss_mz_from=10.0, loss_mz_to=200.0)
143 |         s = msfilters.require_minimum_number_of_peaks(s, n_required=10)
144 |         return s
145 | 
146 |     # Load data from MGF file and apply filters
147 |     spectrums = [spectrum_processing(s) for s in load_from_mgf("reference_spectrums.mgf")]
148 | 
149 |     # Omit spectrums that didn't qualify for analysis
150 |     spectrums = [s for s in spectrums if s is not None]
151 | 
152 |     # Create spectrum documents
153 |     reference_documents = [SpectrumDocument(s, n_decimals=2) for s in spectrums]
154 | 
155 |     model_file = "references.model"
156 |     model = train_new_word2vec_model(reference_documents, iterations=[10, 20, 30], filename=model_file,
157 |                                      workers=2, progress_logger=True)
158 | 
159 | Once a word2vec model has been trained, spec2vec allows to calculate the similarities
160 | between mass spectrums based on this model. In cases where the word2vec model was
161 | trained on data different than the data it is applied for, a number of peaks ("words")
162 | might be unknown to the model (if they weren't part of the training dataset). To
163 | account for those cases it is important to specify the ``allowed_missing_percentage``,
164 | as in the example below.
165 | 
166 | .. code-block:: python
167 | 
168 |     import gensim
169 |     from matchms import calculate_scores
170 |     from spec2vec import Spec2Vec
171 | 
172 |     # query_spectrums loaded from files using https://matchms.readthedocs.io/en/latest/api/matchms.importing.load_from_mgf.html
173 |     query_spectrums = [spectrum_processing(s) for s in load_from_mgf("query_spectrums.mgf")]
174 | 
175 |     # Omit spectrums that didn't qualify for analysis
176 |     query_spectrums = [s for s in query_spectrums if s is not None]
177 | 
178 |     # Import pre-trained word2vec model (see code example above)
179 |     model_file = "references.model"
180 |     model = gensim.models.Word2Vec.load(model_file)
181 | 
182 |     # Define similarity_function
183 |     spec2vec_similarity = Spec2Vec(model=model, intensity_weighting_power=0.5,
184 |                                    allowed_missing_percentage=5.0)
185 | 
186 |     # Calculate scores on all combinations of reference spectrums and queries
187 |     scores = calculate_scores(reference_documents, query_spectrums, spec2vec_similarity)
188 | 
189 |     # Find the highest scores for a query spectrum of interest
190 |     best_matches = scores.scores_by_query(query_documents[0], sort=True)[:10]
191 | 
192 |     # Return highest scores
193 |     print([x[1] for x in best_matches])
194 | 
195 | 
196 | Glossary of terms
197 | =================
198 | 
199 | .. list-table::
200 |    :header-rows: 1
201 | 
202 |    * - Term
203 |      - Description
204 |    * - adduct / addition product
205 |      - During ionization in a mass spectrometer, the molecules of the injected compound break apart
206 |        into fragments. When fragments combine into a new compound, this is known as an addition
207 |        product, or adduct.  `Wikipedia <https://en.wikipedia.org/wiki/Adduct>`__
208 |    * - GNPS
209 |      - Knowledge base for sharing of mass spectrometry data (`link <https://gnps.ucsd.edu/ProteoSAFe/static/gnps-splash.jsp>`__).
210 |    * - InChI / :code:`INCHI`
211 |      - InChI is short for International Chemical Identifier. InChIs are useful
212 |        in retrieving information associated with a certain molecule from a
213 |        database.
214 |    * - InChIKey / InChI key / :code:`INCHIKEY`
215 |      - An indentifier for molecules. For example, the InChI key for carbon
216 |        dioxide is :code:`InChIKey=CURLTUGMZLYLDI-UHFFFAOYSA-N` (yes, it
217 |        includes the substring :code:`InChIKey=`).
218 |    * - MGF File / Mascot Generic Format
219 |      - A plan ASCII file format to store peak list data from a mass spectrometry experiment. Links: `matrixscience.com <http://www.matrixscience.com/help/data_file_help.html#GEN>`__,
220 |        `fiehnlab.ucdavis.edu <https://fiehnlab.ucdavis.edu/projects/lipidblast/mgf-files>`__.
221 |    * - parent mass / :code:`parent_mass`
222 |      - Actual mass (in Dalton) of the original compound prior to fragmentation.
223 |        It can be recalculated from the precursor m/z by taking
224 |        into account the charge state and proton/electron masses.
225 |    * - precursor m/z / :code:`precursor_mz`
226 |      - Mass-to-charge ratio of the compound targeted for fragmentation.
227 |    * - SMILES
228 |      - A line notation for describing the structure of chemical species using
229 |        short ASCII strings. For example, water is encoded as :code:`O[H]O`,
230 |        carbon dioxide is encoded as :code:`O=C=O`, etc. SMILES-encoded species may be converted to InChIKey `using a resolver like this one <https://cactus.nci.nih.gov/chemical/structure>`__. The Wikipedia entry for SMILES is `here <https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system>`__.
231 | 
232 | 
233 | ****************************
234 | Documentation for developers
235 | ****************************
236 | 
237 | Installation
238 | ============
239 | 
240 | To install spec2vec, do:
241 | 
242 | .. code-block:: console
243 | 
244 |   git clone https://github.com/iomega/spec2vec.git
245 |   cd spec2vec
246 |   conda env create --file conda/environment-dev.yml
247 |   conda activate spec2vec-dev
248 |   pip install --editable .
249 | 
250 | Run the linter with:
251 | 
252 | .. code-block:: console
253 | 
254 |   prospector
255 | 
256 | Run tests (including coverage) with:
257 | 
258 | .. code-block:: console
259 | 
260 |   pytest
261 | 
262 | 
263 | Conda package
264 | =============
265 | 
266 | The conda packaging is handled by a `recipe at Bioconda <https://github.com/bioconda/bioconda-recipes/blob/master/recipes/spec2vec/meta.yaml>`_.
267 | 
268 | Publishing to PyPI will trigger the creation of a `pull request on the bioconda recipes repository <https://github.com/bioconda/bioconda-recipes/pulls?q=is%3Apr+is%3Aopen+spec2vec>`_
269 | Once the PR is merged the new version of matchms will appear on `https://anaconda.org/bioconda/spec2vec <https://anaconda.org/bioconda/spec2vec>`_ 
270 | 
271 | 
272 | To remove spec2vec package from the active environment:
273 | 
274 | .. code-block:: console
275 | 
276 |   conda remove spec2vec
277 | 
278 | 
279 | To remove spec2vec environment:
280 | 
281 | .. code-block:: console
282 | 
283 |   conda env remove --name spec2vec
284 | 
285 | Contributing
286 | ============
287 | 
288 | If you want to contribute to the development of spec2vec,
289 | have a look at the `contribution guidelines <CONTRIBUTING.md>`_.
290 | 
291 | *******
292 | License
293 | *******
294 | 
295 | Copyright (c) 2023, Netherlands eScience Center & Düsseldorf University of Applied Sciences
296 | 
297 | Licensed under the Apache License, Version 2.0 (the "License");
298 | you may not use this file except in compliance with the License.
299 | You may obtain a copy of the License at
300 | 
301 | http://www.apache.org/licenses/LICENSE-2.0
302 | 
303 | Unless required by applicable law or agreed to in writing, software
304 | distributed under the License is distributed on an "AS IS" BASIS,
305 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
306 | See the License for the specific language governing permissions and
307 | limitations under the License.
308 | 
309 | *******
310 | Credits
311 | *******
312 | 
313 | This package was created with `Cookiecutter
314 | <https://github.com/audreyr/cookiecutter>`_ and the `NLeSC/python-template
315 | <https://github.com/NLeSC/python-template>`_.
316 | 


--------------------------------------------------------------------------------
/conda/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iomega/spec2vec/36553f0e1df589dc02fcb6945fe440ccc2769c69/conda/README.md


--------------------------------------------------------------------------------
/conda/environment-build.yml:
--------------------------------------------------------------------------------
1 | name: spec2vec-build
2 | channels:
3 |   - defaults
4 | dependencies:
5 |   - anaconda-client
6 |   - conda-build
7 |   - conda-verify
8 |   - python >=3.7
9 | 


--------------------------------------------------------------------------------
/conda/environment-dev.yml:
--------------------------------------------------------------------------------
 1 | name: spec2vec-dev
 2 | channels:
 3 |   - bioconda
 4 |   - conda-forge
 5 |   - defaults
 6 |   - nlesc
 7 | dependencies:
 8 |   - gensim >=4.2.0
 9 |   - matchms >=0.6.2
10 |   - numba >=0.51
11 |   - numpy
12 |   - pip
13 |   - python >=3.7
14 |   - scipy
15 |   - tqdm
16 |   - pip:
17 |     - -e ..[dev]
18 | 


--------------------------------------------------------------------------------
/conda/environment.yml:
--------------------------------------------------------------------------------
 1 | name: spec2vec
 2 | channels:
 3 |   - bioconda
 4 |   - conda-forge
 5 |   - defaults
 6 | dependencies:
 7 |   - gensim >=4.2.0
 8 |   - matchms >=0.6.2
 9 |   - numba >=0.51
10 |   - numpy
11 |   - python >=3.7
12 |   - scipy
13 |   - tqdm
14 | 


--------------------------------------------------------------------------------
/conda/meta.yaml:
--------------------------------------------------------------------------------
 1 | {% set name = "spec2vec" %}
 2 | {% set version = "0.8.1" %}
 3 | 
 4 | package:
 5 |   name: {{ name|lower }}
 6 |   version: {{ version }}
 7 | 
 8 | source:
 9 |   path: ..
10 | 
11 | extra:
12 |   channels:
13 |     - nlesc
14 |     - conda-forge
15 |     - bioconda
16 | 
17 | build:
18 |   noarch: python
19 |   preserve_egg_dir: True
20 |   number: 0
21 |   skip: True # [py2k]
22 |   script: {{ PYTHON }} -m pip install --no-deps --ignore-installed . -vv
23 | 
24 | requirements:
25 |   build:
26 |     - conda-build
27 |     - conda-verify
28 |     - pytest-runner
29 |     - python
30 |     - matchms >=0.6.2
31 |     - numpy {{ numpy }}
32 |     - setuptools
33 |   host:
34 |     - python >=3.7
35 |     - pip
36 |     - pytest-runner
37 |     - setuptools
38 |   run:
39 |     - gensim >=4.2.0
40 |     - matchms >=0.14.0, <=0.26.4
41 |     - numba >=0.51
42 |     - numpy
43 |     - pip
44 |     - python >=3.7
45 |     - scipy <=1.10.1
46 |     - tqdm
47 | 
48 | test:
49 |   imports:
50 |     - spec2vec
51 | 
52 | about:
53 |   home: https://github.com/iomega/spec2vec
54 |   license: Apache-2.0
55 |   license_family: APACHE
56 |   license_file: LICENSE
57 |   summary: Word2Vec based similarity measure of mass spectrometry data.
58 |   description: Word2Vec based similarity measure of mass spectrometry data.
59 |   doc_url: https://spec2vec.readthedocs.io/
60 |   dev_url: https://github.com/iomega/spec2vec
61 | 
62 | extra:
63 |   recipe-maintainers:
64 |     - fdiblen
65 |     - florian-huber
66 | 


--------------------------------------------------------------------------------
/integration-tests/test_user_workflow_spec2vec.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iomega/spec2vec/36553f0e1df589dc02fcb6945fe440ccc2769c69/integration-tests/test_user_workflow_spec2vec.model


--------------------------------------------------------------------------------
/integration-tests/test_user_workflow_spec2vec.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import gensim
 3 | import numpy as np
 4 | from matchms import calculate_scores
 5 | from matchms.filtering import (add_losses, add_parent_mass, default_filters,
 6 |                                normalize_intensities,
 7 |                                reduce_to_number_of_peaks,
 8 |                                require_minimum_number_of_peaks, select_by_mz)
 9 | from matchms.importing import load_from_mgf
10 | from spec2vec import Spec2Vec, SpectrumDocument
11 | 
12 | 
13 | def test_user_workflow_spec2vec():
14 |     """Test typical user workflow to get from mass spectra to spec2vec similarities.
15 | 
16 |     This test will run a typical workflow example using a small dataset and a
17 |     pretrained word2vec model. One main aspect of this is to test if users will
18 |     get exactly the same spec2vec similarity scores when starting from a word2vec
19 |     model that was trained and saved elsewhere.
20 |     """
21 |     def apply_my_filters(s):
22 |         """This is how a user would typically design his own pre- and post-
23 |         processing pipeline."""
24 |         s = default_filters(s)
25 |         s = add_parent_mass(s)
26 |         s = normalize_intensities(s)
27 |         s = reduce_to_number_of_peaks(s, n_required=10, ratio_desired=0.5)
28 |         s = select_by_mz(s, mz_from=0, mz_to=1000)
29 |         s = add_losses(s, loss_mz_from=10.0, loss_mz_to=200.0)
30 |         s = require_minimum_number_of_peaks(s, n_required=5)
31 |         return s
32 | 
33 |     repository_root = os.path.join(os.path.dirname(__file__), "..")
34 |     spectrums_file = os.path.join(repository_root, "tests", "data", "pesticides.mgf")
35 | 
36 |     # apply my filters to the data
37 |     spectrums = [apply_my_filters(s) for s in load_from_mgf(spectrums_file)]
38 | 
39 |     # omit spectrums that didn't qualify for analysis
40 |     spectrums = [s for s in spectrums if s is not None]
41 | 
42 |     # convert spectrums to spectrum 'documents'
43 |     documents = [SpectrumDocument(s, n_decimals=1) for s in spectrums]
44 | 
45 |     model_file = os.path.join(repository_root, "integration-tests", "test_user_workflow_spec2vec.model")
46 |     if os.path.isfile(model_file):
47 |         model = gensim.models.Word2Vec.load(model_file)
48 |     else:
49 |         # create and train model
50 |         model = gensim.models.Word2Vec([d.words for d in documents], size=5, min_count=1)
51 |         model.train([d.words for d in documents], total_examples=len(documents), epochs=20)
52 |         model.save(model_file)
53 | 
54 |     # define similarity_function
55 |     spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5)
56 | 
57 |     references = documents[:26]
58 |     queries = documents[25:]
59 | 
60 |     # calculate scores on all combinations of references and queries
61 |     scores = list(calculate_scores(references, queries, spec2vec))
62 | 
63 |     # filter out self-comparisons
64 |     filtered = [(reference, query, score) for (reference, query, score) in scores if reference != query]
65 | 
66 |     sorted_by_score = sorted(filtered, key=lambda elem: elem[2], reverse=True)
67 | 
68 |     actual_top10 = sorted_by_score[:10]
69 | 
70 |     expected_top10 = [
71 |         (documents[19], documents[25], 0.9999121928249473),
72 |         (documents[20], documents[25], 0.9998846890269892),
73 |         (documents[20], documents[45], 0.9998756073673759),
74 |         (documents[25], documents[45], 0.9998750427994474),
75 |         (documents[19], documents[27], 0.9998722768460854),
76 |         (documents[22], documents[27], 0.9998633023352553),
77 |         (documents[18], documents[27], 0.9998616961532616),
78 |         (documents[19], documents[45], 0.9998528723697396),
79 |         (documents[14], documents[71], 0.9998404364805897),
80 |         (documents[20], documents[27], 0.9998336807761137)
81 |     ]
82 | 
83 |     assert [x[0] for x in actual_top10] == [x[0] for x in expected_top10]
84 |     assert [x[1] for x in actual_top10] == [x[1] for x in expected_top10]
85 |     assert np.allclose([x[2][0] for x in actual_top10], [x[2] for x in expected_top10]), "Expected different top 10 table."
86 | 


--------------------------------------------------------------------------------
/readthedocs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/readthedocs/conf.py:
--------------------------------------------------------------------------------
  1 | # Configuration file for the Sphinx documentation builder.
  2 | #
  3 | # This file only contains a selection of the most common options. For a full
  4 | # list see the documentation:
  5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
  6 | 
  7 | # -- Path setup --------------------------------------------------------------
  8 | 
  9 | # If extensions (or modules to document with autodoc) are in another directory,
 10 | # add these directories to sys.path here. If the directory is relative to the
 11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 12 | #
 13 | import os
 14 | import sys
 15 | import spec2vec
 16 | 
 17 | 
 18 | d = os.path.dirname(os.path.realpath(__file__))
 19 | sys.path.insert(0, os.path.join(d, ".."))
 20 | 
 21 | 
 22 | # -- Project information -----------------------------------------------------
 23 | 
 24 | project = "spec2vec"
 25 | copyright = "2020, Netherlands eScience Center"
 26 | author = "Netherlands eScience Center"
 27 | 
 28 | 
 29 | # -- General configuration ---------------------------------------------------
 30 | 
 31 | # Add any Sphinx extension module names here, as strings. They can be
 32 | # extensions coming with Sphinx (named "sphinx.ext.*") or your custom
 33 | # ones.
 34 | extensions = [
 35 |     "sphinx.ext.autodoc",
 36 |     "sphinx.ext.coverage",
 37 |     "sphinx.ext.intersphinx",
 38 |     "sphinx.ext.viewcode",
 39 |     "sphinx.ext.todo",
 40 |     "sphinx.ext.doctest",
 41 |     "sphinxcontrib.apidoc",
 42 |     "sphinx.ext.napoleon",
 43 | ]
 44 | 
 45 | apidoc_module_dir = "../spec2vec"
 46 | apidoc_output_dir = "./api"
 47 | apidoc_excluded_paths = ["tests", "readthedocs"]
 48 | apidoc_separate_modules = True
 49 | apidoc_module_first = True
 50 | # Hide undocumented member by excluding default undoc-members option
 51 | os.environ["SPHINX_APIDOC_OPTIONS"] = "members,show-inheritance"
 52 | 
 53 | # Add any paths that contain templates here, relative to this directory.
 54 | templates_path = ["_templates"]
 55 | 
 56 | # The language for content autogenerated by Sphinx. Refer to documentation
 57 | # for a list of supported languages.
 58 | #
 59 | # This is also used if you do content translation via gettext catalogs.
 60 | # Usually you set "language" from the command line for these cases.
 61 | language = "en"
 62 | 
 63 | # List of patterns, relative to source directory, that match files and
 64 | # directories to ignore when looking for source files.
 65 | # This pattern also affects html_static_path and html_extra_path.
 66 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "readthedocs/conf.rst"]
 67 | 
 68 | # Include class __init__ and __call__ docstrings.
 69 | autodoc_default_options = {
 70 |     'special-members': '__init__,__call__',
 71 | }
 72 | 
 73 | # -- Options for HTML output -------------------------------------------------
 74 | 
 75 | # The theme to use for HTML and HTML Help pages.  See the documentation for
 76 | # a list of builtin themes.
 77 | #
 78 | html_theme = "alabaster"
 79 | 
 80 | html_theme_options = {
 81 |     "github_user": "spec2vec",
 82 |     "github_repo": "spec2vec",
 83 | }
 84 | 
 85 | # Add any paths that contain custom static files (such as style sheets) here,
 86 | # relative to this directory. They are copied after the builtin static files,
 87 | # so a file named "default.css" will overwrite the builtin "default.css".
 88 | html_static_path = []
 89 | 
 90 | # -- Extension configuration -------------------------------------------------
 91 | 
 92 | # -- Options for todo extension ----------------------------------------------
 93 | 
 94 | # If true, `todo` and `todoList` produce output, else they produce nothing.
 95 | todo_include_todos = True
 96 | 
 97 | # -- Options for intersphinx extension ----------------------------------------------
 98 | 
 99 | intersphinx_mapping = {
100 |     "https://docs.python.org/3": None,
101 |     "numpy": ("https://docs.scipy.org/doc/numpy", None),
102 |     "scipy": ("https://docs.scipy.org/doc/scipy/reference", None),
103 |     "gensim": ("https://radimrehurek.com/gensim", None),
104 |     "matchms": ("https://matchms.readthedocs.io/en/latest", None),
105 | }
106 | 


--------------------------------------------------------------------------------
/readthedocs/index.rst:
--------------------------------------------------------------------------------
  1 | .. spec2vec documentation master file, created by
  2 |    sphinx-quickstart on Tue Apr  7 09:16:44 2020.
  3 |    You can adapt this file completely to your liking, but it should at least
  4 |    contain the root `toctree` directive.
  5 | 
  6 | Welcome to spec2vec's documentation!
  7 | ====================================
  8 | 
  9 | Word2Vec based similarity measure of mass spectrometry data.
 10 | 
 11 | .. toctree::
 12 |    :maxdepth: 3
 13 |    :caption: Contents:
 14 | 
 15 |    API <api/spec2vec.rst>
 16 | 
 17 | Installation
 18 | ============
 19 | 
 20 | Prerequisites:  
 21 | 
 22 | - Python 3.7 or 3.8  
 23 | - Recommended: Anaconda
 24 | 
 25 | We recommend installing spec2vec from Anaconda Cloud with
 26 | 
 27 | .. code-block:: console
 28 | 
 29 |   # install spec2vec in a new virtual environment to avoid dependency clashes
 30 |   conda create --name spec2vec python=3.8
 31 |   conda activate spec2vec
 32 |   conda install --channel nlesc --channel bioconda --channel conda-forge spec2vec
 33 | 
 34 | Alternatively, spec2vec can also be installed using ``pip``. When using spec2vec together with ``matchms`` it is important to note that only the Anaconda install will make sure that also ``rdkit`` is installed properly, which is requried for a few matchms filter functions (it is not required for any spec2vec related functionalities though).
 35 | 
 36 | .. code-block:: console
 37 | 
 38 |   pip install spec2vec
 39 | 
 40 | Examples
 41 | ========
 42 | 
 43 | Train a word2vec model
 44 | **********************
 45 | Below a code example of how to process a large data set of reference spectra to
 46 | train a word2vec model from scratch. Spectra are converted to documents using :py:class:`~spec2vec.SpectrumDocument` which converts spectrum peaks into "words" according to their m/z ratio (for instance ``peak@100.39``). A new word2vec model can then trained using :py:func:`~spec2vec.model_building.train_new_word2vec_model` which will set the training parameters to spec2vec defaults unless specified otherwise. Word2Vec models learn from co-occurences of peaks ("words") across many different spectra.
 47 | To get a model that can give a meaningful representation of a set of
 48 | given spectra it is desirable to train the model on a large and representative
 49 | dataset.
 50 | 
 51 | .. code-block:: python
 52 | 
 53 |     import os
 54 |     from matchms.filtering import add_losses
 55 |     from matchms.filtering import add_parent_mass
 56 |     from matchms.filtering import default_filters
 57 |     from matchms.filtering import normalize_intensities
 58 |     from matchms.filtering import reduce_to_number_of_peaks
 59 |     from matchms.filtering import require_minimum_number_of_peaks
 60 |     from matchms.filtering import select_by_mz
 61 |     from matchms.importing import load_from_mgf
 62 |     from spec2vec import SpectrumDocument
 63 |     from spec2vec.model_building import train_new_word2vec_model
 64 | 
 65 |     def spectrum_processing(s):
 66 |         """This is how one would typically design a desired pre- and post-
 67 |         processing pipeline."""
 68 |         s = default_filters(s)
 69 |         s = add_parent_mass(s)
 70 |         s = normalize_intensities(s)
 71 |         s = reduce_to_number_of_peaks(s, n_required=10, ratio_desired=0.5, n_max=500)
 72 |         s = select_by_mz(s, mz_from=0, mz_to=1000)
 73 |         s = add_losses(s, loss_mz_from=10.0, loss_mz_to=200.0)
 74 |         s = require_minimum_number_of_peaks(s, n_required=10)
 75 |         return s
 76 | 
 77 |     # Load data from MGF file and apply filters
 78 |     spectrums = [spectrum_processing(s) for s in load_from_mgf("reference_spectrums.mgf")]
 79 | 
 80 |     # Omit spectrums that didn't qualify for analysis
 81 |     spectrums = [s for s in spectrums if s is not None]
 82 | 
 83 |     # Create spectrum documents
 84 |     reference_documents = [SpectrumDocument(s, n_decimals=2) for s in spectrums]
 85 | 
 86 |     model_file = "references.model"
 87 |     model = train_new_word2vec_model(reference_documents, model_file, iterations=[10, 20, 30],
 88 |                                      workers=2, progress_logger=True)
 89 | 
 90 | Derive spec2vec similarity scores
 91 | *********************************
 92 | Once a word2vec model has been trained, spec2vec allows to calculate the similarities
 93 | between mass spectrums based on this model. In cases where the word2vec model was
 94 | trained on data different than the data it is applied for, a number of peaks ("words")
 95 | might be unknown to the model (if they weren't part of the training dataset). To
 96 | account for those cases it is important to specify the ``allowed_missing_percentage``,
 97 | as in the example below.
 98 | 
 99 | .. code-block:: python
100 | 
101 |     import gensim
102 |     from matchms import calculate_scores
103 |     from spec2vec import Spec2Vec
104 | 
105 |     # query_spectrums loaded from files using https://matchms.readthedocs.io/en/latest/api/matchms.importing.load_from_mgf.html
106 |     query_spectrums = [spectrum_processing(s) for s in load_from_mgf("query_spectrums.mgf")]
107 | 
108 |     # Omit spectrums that didn't qualify for analysis
109 |     query_spectrums = [s for s in query_spectrums if s is not None]
110 | 
111 |     # Import pre-trained word2vec model (see code example above)
112 |     model_file = "references.model"
113 |     model = gensim.models.Word2Vec.load(model_file)
114 | 
115 |     # Define similarity_function
116 |     spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5,
117 |                         allowed_missing_percentage=5.0)
118 | 
119 |     # Calculate scores on all combinations of reference spectrums and queries
120 |     scores = calculate_scores(reference_documents, query_spectrums, spec2vec)
121 | 
122 |     # Find the highest scores for a query spectrum of interest
123 |     best_matches = scores.scores_by_query(query_documents[0], sort=True)[:10]
124 | 
125 |     # Return highest scores
126 |     print([x[1] for x in best_matches])
127 | 
128 | Indices and tables
129 | ==================
130 | 
131 | * :ref:`genindex`
132 | * :ref:`modindex`
133 | * :ref:`search`
134 | 


--------------------------------------------------------------------------------
/readthedocs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [bumpversion]
 2 | current_version = 0.8.0
 3 | 
 4 | [bumpversion:file:conda/meta.yaml]
 5 | search = set version = "{current_version}"
 6 | replace = set version = "{new_version}"
 7 | 
 8 | [bumpversion:file:spec2vec/__version__.py]
 9 | search = __version__ = '{current_version}'
10 | replace = __version__ = '{new_version}'
11 | 
12 | [isort]
13 | sections = FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER
14 | no_lines_before = FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER
15 | lines_after_imports = 2
16 | 
17 | [metadata]
18 | description-file = README.rst
19 | 
20 | [aliases]
21 | test = pytest
22 | 
23 | [coverage:run]
24 | branch = True
25 | source = spec2vec
26 | 
27 | [tool:pytest]
28 | testpaths = tests integration-tests
29 | python_classes = *TestSuite
30 | junit_family = xunit2
31 | 
32 | [build_sphinx]
33 | source-dir = docs
34 | build-dir = docs/_build
35 | all_files = 1
36 | builder = html
37 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import os
 3 | from setuptools import find_packages, setup
 4 | 
 5 | 
 6 | here = os.path.abspath(os.path.dirname(__file__))
 7 | 
 8 | version = {}
 9 | with open(os.path.join(here, "spec2vec", "__version__.py")) as f:
10 |     exec(f.read(), version)
11 | 
12 | with open("README.rst") as readme_file:
13 |     readme = readme_file.read()
14 | 
15 | setup(
16 |     name="spec2vec",
17 |     version=version["__version__"],
18 |     description="Word2Vec based similarity measure of mass spectrometry data.",
19 |     long_description=readme,
20 |     long_description_content_type="text/x-rst",
21 |     author="Spec2Vec developer team",
22 |     author_email="florian.huber@hs-duesseldorf.de",
23 |     url="https://github.com/iomega/spec2vec",
24 |     packages=find_packages(),
25 |     include_package_data=True,
26 |     license="Apache Software License 2.0",
27 |     zip_safe=False,
28 |     keywords=[
29 |         "word2vec",
30 |         "mass spectrometry",
31 |         "fuzzy matching",
32 |         "fuzzy search"
33 |     ],
34 |     classifiers=[
35 |         "Development Status :: 4 - Beta",
36 |         "Intended Audience :: Education",
37 |         "Intended Audience :: Science/Research",
38 |         "Intended Audience :: Developers",
39 |         "License :: OSI Approved :: Apache Software License",
40 |         "Natural Language :: English",
41 |         "Programming Language :: Python :: 3",
42 |         "Programming Language :: Python :: 3.7",
43 |         "Programming Language :: Python :: 3.8",
44 |         "Programming Language :: Python :: 3.9",
45 |     ],
46 |     test_suite="tests",
47 |     python_requires='>=3.7',
48 |     install_requires=[
49 |         "gensim >=4.2.0",
50 |         "matchms >=0.14.0,<=0.26.4",
51 |         "numba >=0.51",
52 |         "numpy",
53 |         "scipy <=1.10.1",
54 |         "tqdm",
55 |     ],
56 |     extras_require={"dev": ["bump2version",
57 |                             "isort>=5.1.0",
58 |                             "pylint<2.12.0",
59 |                             "prospector[with_pyroma]",
60 |                             "pytest",
61 |                             "pytest-cov",
62 |                             "sphinx>=4.0.0",
63 |                             "sphinx_rtd_theme",
64 |                             "sphinxcontrib-apidoc",
65 |                             "yapf",],
66 |     }
67 | )
68 | 


--------------------------------------------------------------------------------
/sonar-project.properties:
--------------------------------------------------------------------------------
 1 | sonar.organization=iomega
 2 | sonar.projectKey=iomega_spec2vec
 3 | sonar.host.url=https://sonarcloud.io
 4 | sonar.sources=spec2vec/
 5 | sonar.tests=tests/,integration-tests/
 6 | sonar.links.homepage=https://github.com/iomega/spec2vec
 7 | sonar.links.scm=https://github.com/iomega/spec2vec
 8 | sonar.links.issue=https://github.com/iomega/spec2vec/issues
 9 | sonar.links.ci=https://github.com/iomega/spec2vec/actions
10 | sonar.python.coverage.reportPaths=coverage.xml
11 | sonar.python.xunit.reportPath=xunit-result.xml
12 | sonar.python.pylint.reportPath=pylint-report.txt
13 | 


--------------------------------------------------------------------------------
/spec2vec/Document.py:
--------------------------------------------------------------------------------
 1 | class Document:
 2 |     """Parent class for documents as required by spec2vec.
 3 | 
 4 |     Use this as parent class to build your own document class. An example used for
 5 |     mass spectra is SpectrumDocument."""
 6 |     def __init__(self, obj):
 7 |         """
 8 | 
 9 |         Parameters
10 |         ----------
11 |         obj:
12 |             Input object of desired class.
13 |         """
14 |         self._obj = obj
15 |         self._index = 0
16 |         self._make_words()
17 | 
18 |     def __iter__(self):
19 |         return self
20 | 
21 |     def __len__(self):
22 |         return len(self.words)
23 | 
24 |     def __next__(self):
25 |         """gensim.models.Word2Vec() wants its corpus elements to be iterable"""
26 |         if self._index < len(self.words):
27 |             word = self.words[self._index]
28 |             self._index += 1
29 |             return word
30 |         self._index = 0
31 |         raise StopIteration
32 | 
33 |     def __str__(self):
34 |         return self.words.__str__()
35 | 
36 |     def _make_words(self):
37 |         print("You should override this method in your own subclass.")
38 |         self.words = []
39 |         return self
40 | 


--------------------------------------------------------------------------------
/spec2vec/Spec2Vec.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from typing import List, Union
  3 | import numpy as np
  4 | from gensim.models import Word2Vec
  5 | from matchms import Spectrum
  6 | from matchms.similarity.BaseSimilarity import BaseSimilarity
  7 | from tqdm import tqdm
  8 | from spec2vec.serialization import Word2VecLight
  9 | from spec2vec.SpectrumDocument import SpectrumDocument
 10 | from spec2vec.vector_operations import (calc_vector, cosine_similarity,
 11 |                                         cosine_similarity_matrix)
 12 | 
 13 | 
 14 | class Spec2Vec(BaseSimilarity):
 15 |     """Calculate spec2vec similarity scores between a reference and a query.
 16 | 
 17 |     Using a trained model, spectrum documents will be converted into spectrum
 18 |     vectors. The spec2vec similarity is then the cosine similarity score between
 19 |     two spectrum vectors.
 20 | 
 21 |     The following code example shows how to calculate spec2vec similarities
 22 |     between query and reference spectrums. It uses a dummy model that can be found at
 23 |     :download:`../integration-tests/test_user_workflow_spec2vec.model </../integration-tests/test_user_workflow_spec2vec.model>`
 24 |     and a small test dataset that can be found at
 25 |     :download:`../tests/pesticides.mgf </../tests/pesticides.mgf>`.
 26 | 
 27 |     .. testcode::
 28 | 
 29 |         import os
 30 |         import gensim
 31 |         from matchms import calculate_scores
 32 |         from matchms.filtering import add_losses
 33 |         from matchms.filtering import default_filters
 34 |         from matchms.filtering import normalize_intensities
 35 |         from matchms.filtering import require_minimum_number_of_peaks
 36 |         from matchms.filtering import select_by_intensity
 37 |         from matchms.filtering import select_by_mz
 38 |         from matchms.importing import load_from_mgf
 39 |         from spec2vec import Spec2Vec
 40 | 
 41 |         def spectrum_processing(s):
 42 |             '''This is how a user would typically design his own pre- and post-
 43 |             processing pipeline.'''
 44 |             s = default_filters(s)
 45 |             s = normalize_intensities(s)
 46 |             s = select_by_mz(s, mz_from=0, mz_to=1000)
 47 |             s = select_by_intensity(s, intensity_from=0.01)
 48 |             s = add_losses(s, loss_mz_from=10.0, loss_mz_to=200.0)
 49 |             s = require_minimum_number_of_peaks(s, n_required=5)
 50 |             return s
 51 | 
 52 |         spectrums_file = os.path.join(os.getcwd(), "..", "tests", "data", "pesticides.mgf")
 53 | 
 54 |         # Load data and apply the above defined filters to the data
 55 |         spectrums = [spectrum_processing(s) for s in load_from_mgf(spectrums_file)]
 56 | 
 57 |         # Omit spectrums that didn't qualify for analysis
 58 |         spectrums = [s for s in spectrums if s is not None]
 59 | 
 60 |         # Load pretrained model (here dummy model)
 61 |         model_file = os.path.join(os.getcwd(), "..", "integration-tests", "test_user_workflow_spec2vec.model")
 62 |         model = gensim.models.Word2Vec.load(model_file)
 63 | 
 64 |         # Define similarity_function
 65 |         spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5)
 66 | 
 67 |         # Calculate scores on all combinations of references and queries
 68 |         scores = calculate_scores(spectrums[10:], spectrums[:10], spec2vec)
 69 | 
 70 |         # Select top-10 candidates for first query spectrum
 71 |         spectrum0_top10 = scores.scores_by_query(spectrums[0], sort=True)[:10]
 72 | 
 73 |         # Display spectrum IDs for top-10 matches (only works if metadata contains "spectrum_id" field)
 74 |         print([s[0].metadata['spectrum_id'] for s in spectrum0_top10])
 75 | 
 76 |     Should output
 77 | 
 78 |     .. testoutput::
 79 | 
 80 |         ['CCMSLIB00001058300', 'CCMSLIB00001058289', 'CCMSLIB00001058303', ...
 81 | 
 82 |     """
 83 |     def __init__(self, model: Union[Word2Vec, Word2VecLight], intensity_weighting_power: Union[float, int] = 0,
 84 |                  allowed_missing_percentage: Union[float, int] = 10, progress_bar: bool = False):
 85 |         """
 86 | 
 87 |         Parameters
 88 |         ----------
 89 |         model:
 90 |             Expected input is a gensim word2vec model that has been trained on
 91 |             the desired set of spectrum documents.
 92 |         intensity_weighting_power:
 93 |             Spectrum vectors are a weighted sum of the word vectors. The given
 94 |             word intensities will be raised to the given power.
 95 |             The default is 0, which means that no weighing will be done.
 96 |         allowed_missing_percentage:
 97 |             Set the maximum allowed percentage of the document that may be missing
 98 |             from the input model. This is measured as percentage of the weighted, missing
 99 |             words compared to all word vectors of the document. Default is 10, which
100 |             means up to 10% missing words are allowed. If more words are missing from
101 |             the model, an empty embedding will be returned (leading to similarities of 0)
102 |             and a warning is raised.
103 |         progress_bar:
104 |             Set to True to monitor the embedding creating with a progress bar.
105 |             Default is False.
106 |         """
107 |         self.model = model
108 |         self.n_decimals = self._get_word_decimals(self.model)
109 |         self.intensity_weighting_power = intensity_weighting_power
110 |         self.allowed_missing_percentage = allowed_missing_percentage
111 |         self.vector_size = model.wv.vector_size
112 |         self.disable_progress_bar = not progress_bar
113 | 
114 |     def pair(self, reference: Union[SpectrumDocument, Spectrum],
115 |              query: Union[SpectrumDocument, Spectrum]) -> float:
116 |         """Calculate the spec2vec similaritiy between a reference and a query.
117 | 
118 |         Parameters
119 |         ----------
120 |         reference:
121 |             Reference spectrum or spectrum document.
122 |         query:
123 |             Query spectrum or spectrum document.
124 | 
125 |         Returns
126 |         -------
127 |         spec2vec_similarity
128 |             Spec2vec similarity score.
129 |         """
130 |         reference_vector = self._calculate_embedding(reference)
131 |         query_vector = self._calculate_embedding(query)
132 | 
133 |         return cosine_similarity(reference_vector, query_vector)
134 | 
135 |     def matrix(self, references: Union[List[SpectrumDocument], List[Spectrum]],
136 |                queries: Union[List[SpectrumDocument], List[Spectrum]],
137 |                array_type: str = "numpy",
138 |                is_symmetric: bool = False) -> np.ndarray:
139 |         """Calculate the spec2vec similarities between all references and queries.
140 | 
141 |         Parameters
142 |         ----------
143 |         references:
144 |             Reference spectrums or spectrum documents.
145 |         queries:
146 |             Query spectrums or spectrum documents.
147 |         array_type
148 |             Specify the output array type. Can be "numpy" or "sparse".
149 |             Currently, only "numpy" is supported and will return a numpy array.
150 |             Future versions will include "sparse" as option to return a COO-sparse array.
151 |         is_symmetric:
152 |             Set to True if references == queries to speed up calculation about 2x.
153 |             Uses the fact that in this case score[i, j] = score[j, i]. Default is False.
154 | 
155 |         Returns
156 |         -------
157 |         spec2vec_similarity
158 |             Array of spec2vec similarity scores.
159 |         """
160 |         n_rows = len(references)
161 |         reference_vectors = np.empty((n_rows, self.vector_size), dtype="float")
162 |         for index_reference, reference in enumerate(tqdm(references, desc='Calculating vectors of reference spectrums',
163 |                                                          disable=self.disable_progress_bar)):
164 |             reference_vectors[index_reference, 0:self.vector_size] = self._calculate_embedding(reference)
165 | 
166 |         n_cols = len(queries)
167 |         if is_symmetric:
168 |             assert np.all(references == queries), \
169 |                 "Expected references to be equal to queries for is_symmetric=True"
170 |             query_vectors = reference_vectors
171 |         else:
172 |             query_vectors = np.empty((n_cols, self.vector_size), dtype="float")
173 |             for index_query, query in enumerate(tqdm(queries, desc='Calculating vectors of query spectrums',
174 |                                                      disable=self.disable_progress_bar)):
175 |                 query_vectors[index_query, 0:self.vector_size] = self._calculate_embedding(query)
176 | 
177 |         spec2vec_similarity = cosine_similarity_matrix(reference_vectors, query_vectors)
178 | 
179 |         return spec2vec_similarity
180 | 
181 |     @staticmethod
182 |     def _get_word_decimals(model):
183 |         """Read the decimal rounding that was used to train the model"""
184 |         word_regex = r"[a-z]{4}@[0-9]{1,5}."
185 |         example_word = next(iter(model.wv.key_to_index))
186 | 
187 |         return len(re.split(word_regex, example_word)[-1])
188 | 
189 |     def _calculate_embedding(self, spectrum_in: Union[SpectrumDocument, Spectrum]):
190 |         """Generate Spec2Vec embedding vectors from input spectrum (or SpectrumDocument)"""
191 |         if isinstance(spectrum_in, Spectrum):
192 |             spectrum_in = SpectrumDocument(spectrum_in, n_decimals=self.n_decimals)
193 |         elif isinstance(spectrum_in, SpectrumDocument):
194 |             assert spectrum_in.n_decimals == self.n_decimals, \
195 |                 "Decimal rounding of input data does not agree with model vocabulary."
196 |         else:
197 |             raise ValueError("Expected input type to be Spectrum or SpectrumDocument")
198 |         return calc_vector(self.model,
199 |                            spectrum_in,
200 |                            self.intensity_weighting_power,
201 |                            self.allowed_missing_percentage)
202 | 


--------------------------------------------------------------------------------
/spec2vec/SpectrumDocument.py:
--------------------------------------------------------------------------------
  1 | from typing import Optional
  2 | from matchms.Spikes import Spikes
  3 | from .Document import Document
  4 | 
  5 | 
  6 | class SpectrumDocument(Document):
  7 |     """Create documents from spectra.
  8 | 
  9 |     Every peak (and loss) positions (m/z value) will be converted into a string "word".
 10 |     The entire list of all peak words forms a spectrum document. Peak words have
 11 |     the form "peak@100.32" (for n_decimals=2), and losses have the format "loss@100.32".
 12 |     Peaks with identical resulting strings will not be merged, hence same words can
 13 |     exist multiple times in a document (e.g. peaks at 100.31 and 100.29 would lead to
 14 |     two words "peak@100.3" when using n_decimals=1).
 15 | 
 16 |     For example:
 17 | 
 18 |     .. testcode::
 19 | 
 20 |         import numpy as np
 21 |         from matchms import Spectrum
 22 |         from spec2vec import SpectrumDocument
 23 | 
 24 |         spectrum = Spectrum(mz=np.array([100.0, 150.0, 200.51]),
 25 |                             intensities=np.array([0.7, 0.2, 0.1]),
 26 |                             metadata={'compound_name': 'substance1'})
 27 |         spectrum_document = SpectrumDocument(spectrum, n_decimals=1)
 28 | 
 29 |         print(spectrum_document.words)
 30 |         print(spectrum_document.peaks.mz)
 31 |         print(spectrum_document.get("compound_name"))
 32 | 
 33 |     Should output
 34 | 
 35 |     .. testoutput::
 36 | 
 37 |         ['peak@100.0', 'peak@150.0', 'peak@200.5']
 38 |         [100.   150.   200.51]
 39 |         substance1
 40 |     """
 41 |     def __init__(self, spectrum, n_decimals: int = 2):
 42 |         """
 43 | 
 44 |         Parameters
 45 |         ----------
 46 |         spectrum: SpectrumType
 47 |             Input spectrum.
 48 |         n_decimals
 49 |             Peak positions are converted to strings with n_decimal decimals.
 50 |             The default is 2, which would convert a peak at 100.387 into the
 51 |             word "peak@100.39".
 52 |         """
 53 |         self.n_decimals = n_decimals
 54 |         self.weights = None
 55 |         super().__init__(obj=spectrum)
 56 |         self._add_weights()
 57 | 
 58 |     def _make_words(self):
 59 |         """Create word from peaks (and losses)."""
 60 |         peak_words = [f"peak@{mz:.{self.n_decimals}f}" for mz in self._obj.peaks.mz]
 61 |         if self._obj.losses is not None:
 62 |             loss_words = [f"loss@{mz:.{self.n_decimals}f}" for mz in self._obj.losses.mz]
 63 |         else:
 64 |             loss_words = []
 65 |         self.words = peak_words + loss_words
 66 |         return self
 67 | 
 68 |     def _add_weights(self):
 69 |         """Add peaks (and loss) intensities as weights."""
 70 |         assert self._obj.peaks.intensities.max() <= 1, "peak intensities not normalized"
 71 | 
 72 |         peak_intensities = self._obj.peaks.intensities.tolist()
 73 |         if self._obj.losses is not None:
 74 |             loss_intensities = self._obj.losses.intensities.tolist()
 75 |         else:
 76 |             loss_intensities = []
 77 |         self.weights = peak_intensities + loss_intensities
 78 |         return self
 79 | 
 80 |     def get(self, key: str, default=None):
 81 |         """Retrieve value from Spectrum metadata dict. Shorthand for
 82 | 
 83 |         .. code-block:: python
 84 | 
 85 |             val = self._obj.metadata[key]
 86 | 
 87 |         """
 88 |         assert not hasattr(self, key), "Key cannot be attribute of SpectrumDocument class"
 89 |         return self._obj.get(key, default)
 90 | 
 91 |     @property
 92 |     def metadata(self):
 93 |         """Return metadata of original spectrum."""
 94 |         return self._obj.metadata
 95 | 
 96 |     @property
 97 |     def losses(self) -> Optional[Spikes]:
 98 |         """Return losses of original spectrum."""
 99 |         return self._obj.losses
100 | 
101 |     @property
102 |     def peaks(self) -> Spikes:
103 |         """Return peaks of original spectrum."""
104 |         return self._obj.peaks
105 | 


--------------------------------------------------------------------------------
/spec2vec/__init__.py:
--------------------------------------------------------------------------------
 1 | from . import serialization
 2 | from .__version__ import __version__
 3 | from .Document import Document
 4 | from .logging_functions import _init_logger
 5 | from .Spec2Vec import Spec2Vec
 6 | from .SpectrumDocument import SpectrumDocument
 7 | from .vector_operations import calc_vector
 8 | 
 9 | 
10 | _init_logger()
11 | 
12 | 
13 | __all__ = [
14 |     "__version__",
15 |     "calc_vector",
16 |     "Document",
17 |     "serialization",
18 |     "SpectrumDocument",
19 |     "Spec2Vec",
20 | ]
21 | 


--------------------------------------------------------------------------------
/spec2vec/__version__.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.8.1'
2 | 


--------------------------------------------------------------------------------
/spec2vec/logging_functions.py:
--------------------------------------------------------------------------------
  1 | """Spec2Vec logger.
  2 | 
  3 | Spec2Vec functions and method report unexpected or undesired behavior as
  4 | logging WARNING, and additional information as INFO.
  5 | The default logging level is set to WARNING.
  6 | The logger is an adaptation of the matchms logger.
  7 | 
  8 | 
  9 | If you want to output additional
 10 | logging messages, you can lower the logging level to INFO using set_spec2vec_logger_level:
 11 | 
 12 | .. code-block:: python
 13 | 
 14 |     from spec2vec import set_spec2vec_logger_level
 15 | 
 16 |     set_spec2vec_logger_level("INFO")
 17 | 
 18 | This can also be combined with setting the matchms logger which occurs separately
 19 | by using set_matchms_logger_level:
 20 | 
 21 | .. code-block:: python
 22 | 
 23 |     from matchms import set_matchms_logger_level
 24 |     from spec2vec import set_spec2vec_logger_level
 25 | 
 26 |     set_matchms_logger_level("INFO")
 27 |     set_spec2vec_logger_level("INFO")
 28 | 
 29 | If you want to suppress logging warnings, you can also raise the logging level
 30 | to ERROR by:
 31 | 
 32 | .. code-block:: python
 33 | 
 34 |     set_spec2vec_logger_level("ERROR")
 35 | 
 36 | To write logging entries to a local file, you can do the following:
 37 | 
 38 | .. code-block:: python
 39 | 
 40 |     from spec2vec.logging_functions import add_logging_to_file
 41 | 
 42 |     add_logging_to_file("sample.log", loglevel="INFO")
 43 | 
 44 | If you want to write the logging messages to a local file while silencing the
 45 | stream of such messages, you can do the following:
 46 | 
 47 | .. code-block:: python
 48 | 
 49 |     from spec2vec.logging_functions import add_logging_to_file
 50 | 
 51 |     add_logging_to_file("sample.log", loglevel="INFO",
 52 |                         remove_stream_handlers=True)
 53 | 
 54 | """
 55 | import logging
 56 | import logging.config
 57 | import sys
 58 | import matchms.logging_functions as matchms_logging
 59 | 
 60 | 
 61 | _formatter = logging.Formatter(
 62 |     '%(asctime)s:%(levelname)s:%(name)s:%(module)s:%(message)s')
 63 | 
 64 | 
 65 | def _init_logger(logger_name="spec2vec"):
 66 |     """Initialize spec2vec logger."""
 67 |     logger = logging.getLogger(logger_name)
 68 |     logger.setLevel(logging.WARNING)
 69 |     handler = logging.StreamHandler(sys.stdout)
 70 |     handler.setLevel(logging.WARNING)
 71 |     handler.setFormatter(_formatter)
 72 |     logger.addHandler(handler)
 73 |     logger.info('Completed configuring spec2vec logger.')
 74 | 
 75 | 
 76 | def set_spec2vec_logger_level(loglevel: str, logger_name="spec2vec"):
 77 |     """Update logging level to given loglevel.
 78 | 
 79 |     Parameters
 80 |     ----------
 81 |     loglevels
 82 |         Can be 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'.
 83 |     logger_name
 84 |         Default is "spec2vec". Change if logger name should be different.
 85 |     """
 86 |     matchms_logging.set_matchms_logger_level(loglevel=loglevel, logger_name=logger_name)
 87 | 
 88 | 
 89 | def add_logging_to_file(filename: str, loglevel: str = "INFO",
 90 |                         remove_stream_handlers: bool = False,
 91 |                         logger_name="spec2vec"):
 92 |     """Add logging to file.
 93 | 
 94 |     Current implementation does not change the initial logging stream,
 95 |     but simply adds a FileHandler to write logging entries to a file.
 96 | 
 97 |     Parameters
 98 |     ----------
 99 |     filename
100 |         Name of file for write logging output.
101 |     loglevels
102 |         Can be 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'.
103 |     remove_stream_handlers
104 |         Set to True if only logging to file is desired.
105 |     logger_name
106 |         Default is "spec2vec". Change if logger name should be different.
107 |     """
108 |     matchms_logging.add_logging_to_file(filename=filename,
109 |                                         loglevel=loglevel,
110 |                                         remove_stream_handlers=remove_stream_handlers,
111 |                                         logger_name=logger_name)
112 | 
113 | 
114 | def reset_spec2vec_logger(logger_name="spec2vec"):
115 |     """Reset spec2vec logger to initial state.
116 | 
117 |     This will remove all logging Handlers and initialize a new spec2vec logger.
118 |     Use this function to reset previous changes made to the default spec2vec logger.
119 | 
120 |     Parameters
121 |     ----------
122 |     logger_name
123 |         Default is "spec2vec". Change if logger name should be different.
124 |     """
125 |     logger = logging.getLogger(logger_name)
126 |     logger.handlers.clear()
127 |     _init_logger()
128 | 


--------------------------------------------------------------------------------
/spec2vec/model_building.py:
--------------------------------------------------------------------------------
  1 | """This module contains functions that will help users to train a word2vec model
  2 | through gensim.
  3 | """
  4 | import logging
  5 | from typing import List, Tuple, Union
  6 | import gensim
  7 | from spec2vec.utils import ModelSaver, TrainingProgressLogger
  8 | 
  9 | 
 10 | logger = logging.getLogger("spec2vec")
 11 | 
 12 | 
 13 | def train_new_word2vec_model(documents: List, iterations: Union[List[int], int], filename: str = None,
 14 |                              progress_logger: bool = True, **settings) -> gensim.models.Word2Vec:
 15 |     """Train a new Word2Vec model (using gensim). Save to file if filename is given.
 16 | 
 17 |     Example code on how to train a word2vec model on a corpus (=list of documents)
 18 |     that is derived from a given set of spectrums (list of matchms.Spectrum instances):
 19 | 
 20 |     .. code-block:: python
 21 | 
 22 |         from matchms import SpectrumDocument
 23 |         from spec2vec.model_building import train_new_word2vec_model
 24 | 
 25 |         documents = [SpectrumDocument(s, n_decimals=1) for s in spectrums]
 26 |         model = train_new_word2vec_model(documents, iterations=20, size=200,
 27 |                                          workers=1, progress_logger=False)
 28 | 
 29 |     Parameters
 30 |     ----------
 31 |     documents:
 32 |         List of documents, each document being a list of words (strings).
 33 |     iterations:
 34 |         Specifies the number of training interations. This can be done by setting
 35 |         iterations to the total number of training epochs (e.g. "iterations=15"),
 36 |         or by passing a list of iterations (e.g. "iterations=[5,10,15]") which will
 37 |         also led to a total training of max(iterations) epochs, but will save the
 38 |         model for every iteration in the list. Temporary models will be saved
 39 |         using the name: filename_TEMP_{#iteration}epoch.model".
 40 |     filename: str,
 41 |         Filename to save model. Default is None, which means no model will be saved.
 42 |         If a list of iterations is passed (e.g. "iterations=[5,10,15]"), then
 43 |         intermediate models will be saved during training (here after 5, 10
 44 |         iterations) using the pattern: filename_TEMP_{#iteration}epoch.model
 45 |     learning_rate_initial:
 46 |         Set initial learning rate. Default is 0.025.
 47 |     learning_rate_decay:
 48 |         After every epoch the learning rate will be lowered by the learning_rate_decay.
 49 |         Default is 0.00025.
 50 |     progress_logger:
 51 |         If True, the training progress will be printed every epoch. Default is True.
 52 |     **settings
 53 |         All other named arguments will be passed to the :py:class:`gensim.models.word2vec.Word2Vec` constructor.
 54 |     sg: int (0,1)
 55 |         For sg = 0 --> CBOW model, for sg = 1 --> skip gram model
 56 |         (see Gensim documentation). Default for Spec2Vec is 0.
 57 |     negative: int
 58 |         from Gensim:  If > 0, negative sampling will be used, the int for
 59 |         negative specifies how many “noise words” should be drawn (usually
 60 |         between 5-20). If set to 0, no negative sampling is used.
 61 |         Default for Spec2Vec is 5.
 62 |     size: int,
 63 |         Dimensions of word vectors. Default is 300.
 64 |     window: int,
 65 |         Window size for context words (small for local context, larger for
 66 |         global context). Spec2Vec expects large windwos. Default is 500.
 67 |     min_count: int,
 68 |         Only consider words that occur at least min_count times in the corpus.
 69 |         Default is 1.
 70 |     workers: int,
 71 |         Number of threads to run the training on (should not be more than
 72 |         number of cores/threads. Default is 4.
 73 | 
 74 |     Returns
 75 |     -------
 76 |     word2vec_model
 77 |         Gensim word2vec model.
 78 |     """
 79 |     settings = set_spec2vec_defaults(**settings)
 80 | 
 81 |     num_of_epochs = max(iterations) if isinstance(iterations, list) else iterations
 82 | 
 83 |     # Convert spec2vec style arguments to gensim style arguments
 84 |     settings = learning_rates_to_gensim_style(num_of_epochs, **settings)
 85 | 
 86 |     # Set callbacks
 87 |     callbacks = []
 88 |     if progress_logger:
 89 |         training_progress_logger = TrainingProgressLogger(num_of_epochs)
 90 |         callbacks.append(training_progress_logger)
 91 |     if filename:
 92 |         if isinstance(iterations, int):
 93 |             iterations = [iterations]
 94 |         model_saver = ModelSaver(num_of_epochs, iterations, filename)
 95 |         callbacks.append(model_saver)
 96 | 
 97 |     # Train word2vec model
 98 |     model = gensim.models.Word2Vec(documents, callbacks=callbacks, **settings)
 99 | 
100 |     return model
101 | 
102 | 
103 | def set_spec2vec_defaults(**settings):
104 |     """Set spec2vec default argument values"(where no user input is give)"."""
105 |     defaults = {
106 |         "sg": 0,
107 |         "negative": 5,
108 |         "vector_size": 300,
109 |         "window": 500,
110 |         "min_count": 1,
111 |         "learning_rate_initial": 0.025,
112 |         "learning_rate_decay": 0.00025,
113 |         "workers": 4,
114 |         "compute_loss": True,
115 |     }
116 |     assert "min_alpha" not in settings, "Expect 'learning_rate_decay' to describe learning rate decrease."
117 |     assert "alpha" not in settings, "Expect 'learning_rate_initial' instead of 'alpha'."
118 | 
119 |     # Set default parameters or replace by **settings input
120 |     for key, value in defaults.items():
121 |         if key in settings:
122 |             msg = f"The value of {key} is set from {value} (default) to {settings[key]}"
123 |             logger.info(msg)
124 |         else:
125 |             settings[key] = value
126 |     return settings
127 | 
128 | 
129 | def learning_rates_to_gensim_style(num_of_epochs, **settings):
130 |     """Convert "learning_rate_initial" and "learning_rate_decay" to gensim
131 |     "alpha" and "min_alpha"."""
132 |     alpha, min_alpha = set_learning_rate_decay(settings["learning_rate_initial"],
133 |                                                settings["learning_rate_decay"], num_of_epochs)
134 |     settings["alpha"] = alpha
135 |     settings["min_alpha"] = min_alpha
136 |     settings["epochs"] = num_of_epochs
137 | 
138 |     # Remove non-Gensim arguments from settings
139 |     del settings["learning_rate_initial"]
140 |     del settings["learning_rate_decay"]
141 |     return settings
142 | 
143 | 
144 | def set_learning_rate_decay(learning_rate_initial: float, learning_rate_decay: float,
145 |                             num_of_epochs: int) -> Tuple[float, float]:
146 |     """The learning rate in Gensim model training is defined by an initial rate
147 |     (alpha) and a final rate (min_alpha). which can be unintuitive. Here those
148 |     parameters will be set based on the given values for learning_rate_initial,
149 |     num_of_epochs, and learning_rate_decay.
150 | 
151 |     Parameters
152 |     ----------
153 |     learning_rate_initial:
154 |         Set initial learning rate.
155 |     learning_rate_decay:
156 |         After evert epoch, the learning rate will be lowered by the learning_rate_decay.
157 |     number_of_epochs:
158 |         Total number of epochs for training.
159 | 
160 |     Returns:
161 |     --------
162 |     alpha:
163 |         Initial learning rate.
164 |     min_alpha:
165 |         Final learning rate.
166 |     """
167 |     min_alpha = learning_rate_initial - num_of_epochs * learning_rate_decay
168 |     if min_alpha < 0:
169 |         msg = ("Number of total iterations is too high for given learning_rate decay.",
170 |                f"Learning_rate_decay will be set from {learning_rate_decay} ",
171 |                "to {learning_rate_initial/num_of_epochs}.")
172 |         logger.warning(msg)
173 |         min_alpha = 0
174 |     return learning_rate_initial, min_alpha
175 | 


--------------------------------------------------------------------------------
/spec2vec/serialization/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Functions for exporting and importing trained :class:`~gensim.models.Word2Vec` model to and from disk.
 3 | ##########################################
 4 | Functions provide the ability to export and import trained :class:`~gensim.models.Word2Vec` model to and from disk
 5 | without pickling the model. The model can be stored in two files: `.json` for metadata and `.npy` for weights.
 6 | """
 7 | from .model_exporting import export_model
 8 | from .model_importing import Word2VecLight, import_model
 9 | 
10 | 
11 | __all__ = [
12 |     "export_model",
13 |     "import_model",
14 |     "Word2VecLight"
15 |     ]
16 | 


--------------------------------------------------------------------------------
/spec2vec/serialization/model_exporting.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | from copy import deepcopy
 4 | from typing import Union
 5 | import numpy as np
 6 | import scipy.sparse
 7 | from gensim.models import Word2Vec
 8 | 
 9 | 
10 | def export_model(model: Word2Vec,
11 |                  output_model_file: Union[str, os.PathLike],
12 |                  output_weights_file: Union[str, os.PathLike]):
13 |     """
14 |     Write a lightweight version of a :class:`~gensim.model.Word2Vec` model to disk. Such a model can be read to
15 |     calculate scores but is not capable of further training.
16 | 
17 |     Parameters
18 |     ----------
19 |     model:
20 |         :class:`~gensim.models.Word2Vec` trained model.
21 |     output_model_file:
22 |         A path of json file to save the model.
23 |     output_weights_file:
24 |         A path of `.npy` file to save the model's weights.
25 |     """
26 |     model = deepcopy(model)
27 |     keyedvectors = extract_keyedvectors(model)
28 |     weights = keyedvectors.pop("vectors")
29 |     keyedvectors["__weights_format"] = get_weights_format(weights)
30 | 
31 |     save_model(keyedvectors, output_model_file)
32 |     save_weights(weights, output_weights_file)
33 | 
34 | 
35 | def save_weights(weights: Union[np.ndarray, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix],
36 |                  output_weights_file: Union[str, os.PathLike]):
37 |     """
38 |     Write model's weights to disk in `.npy` dense array format. If the weights are sparse, they are converted to dense
39 |     prior to saving.
40 |     """
41 |     if isinstance(weights, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)):
42 |         weights = weights.toarray()
43 | 
44 |     np.save(output_weights_file, weights, allow_pickle=False)
45 | 
46 | 
47 | def save_model(keyedvectors: dict, output_model_file: Union[str, os.PathLike]):
48 |     """Write model's metadata to disk in json format."""
49 |     with open(output_model_file, "w", encoding="utf-8") as f:
50 |         json.dump(keyedvectors, f)
51 | 
52 | 
53 | def get_weights_format(weights: Union[np.ndarray, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]) -> str:
54 |     """
55 |     Get the array format of the model's weights.
56 | 
57 |     Parameters
58 |     ----------
59 |     weights:
60 |         Model's weights.
61 | 
62 |     Returns
63 |     -------
64 |     weights_format:
65 |         Format of the model's weights.
66 |     """
67 |     if isinstance(weights, np.ndarray):
68 |         return "np.ndarray"
69 |     if isinstance(weights, scipy.sparse.csr_matrix):
70 |         return "csr_matrix"
71 |     if isinstance(weights, scipy.sparse.csc_matrix):
72 |         return "csc_matrix"
73 |     raise NotImplementedError("The model's weights format is not supported.")
74 | 
75 | 
76 | def extract_keyedvectors(model: Word2Vec) -> dict:
77 |     """
78 |     Extract :class:`~gensim.models.KeyedVectors` object from the model, convert it to a dictionary and
79 |     remove redundant keys.
80 | 
81 |     Parameters
82 |     ----------
83 |     model:
84 |         :class:`~gensim.models.Word2Vec` trained model.
85 | 
86 |     Returns
87 |     -------
88 |     keyedvectors:
89 |         Dictionary representation of :class:`~gensim.models.KeyedVectors` without redundant keys.
90 |     """
91 |     keyedvectors = model.wv.__dict__
92 |     keyedvectors.pop("vectors_lockf", None)
93 |     keyedvectors.pop("expandos", None)
94 |     return keyedvectors
95 | 


--------------------------------------------------------------------------------
/spec2vec/serialization/model_importing.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | from typing import Union
 4 | import numpy as np
 5 | import scipy.sparse
 6 | from gensim.models import KeyedVectors
 7 | 
 8 | 
 9 | class Word2VecLight:
10 |     """
11 |     A lightweight version of :class:`~gensim.models.Word2Vec`. The objects of this class follow the interface of the
12 |     original :class:`~gensim.models.Word2Vec` to the point necessary to calculate Spec2Vec scores. The model cannot be
13 |     used for further training.
14 |     """
15 | 
16 |     def __init__(self, model: dict, weights: Union[np.ndarray, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]):
17 |         """
18 | 
19 |         Parameters
20 |         ----------
21 |         model:
22 |             A dictionary containing the model's metadata.
23 |         weights:
24 |             A numpy array or a scipy sparse matrix containing the model's weights.
25 |         """
26 |         self.wv: KeyedVectors = self._KeyedVectorsBuilder().from_dict(model).with_weights(weights).build()
27 | 
28 |     class _KeyedVectorsBuilder:
29 |         def __init__(self):
30 |             self.vector_size = None
31 |             self.weights = None
32 | 
33 |         def build(self) -> KeyedVectors:
34 |             keyed_vectors = KeyedVectors(self.vector_size)
35 |             keyed_vectors.__dict__ = self.__dict__
36 |             keyed_vectors.vectors = self.weights
37 |             return keyed_vectors
38 | 
39 |         def from_dict(self, dictionary: dict):
40 |             expected_keys = {"vector_size", "__numpys", "__scipys", "__ignoreds", "__recursive_saveloads",
41 |                              "index_to_key", "norms", "key_to_index", "__weights_format"}
42 |             if dictionary.keys() == expected_keys:
43 |                 self.__dict__ = dictionary
44 |             elif expected_keys.symmetric_difference(dictionary.keys()) == {"next_index"}:  # backward compatibility
45 |                 dictionary.pop("next_index")
46 |                 self.__dict__ = dictionary
47 |             else:
48 |                 raise ValueError("The keys of model's dictionary representation do not match the expected keys.")
49 |             return self
50 | 
51 |         def with_weights(self, weights: Union[np.ndarray, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]):
52 |             self.weights = weights
53 |             return self
54 | 
55 | 
56 | def import_model(model_file, weights_file) -> Word2VecLight:
57 |     """
58 |     Read a lightweight version of a :class:`~gensim.models.Word2Vec` model from disk.
59 | 
60 |     Parameters
61 |     ----------
62 |     model_file:
63 |         A path of json file to load the model.
64 |     weights_file:
65 |         A path of `.npy` file to load the model's weights.
66 | 
67 |     Returns
68 |     -------
69 |     :class:`~spec2vec.serialization.model_importing.Word2VecLight` – a lightweight version of a
70 |     :class:`~gensim.models.Word2Vec`
71 |     """
72 |     with open(model_file, "r", encoding="utf-8") as f:
73 |         model: dict = json.load(f)
74 | 
75 |     weights = load_weights(weights_file, model["__weights_format"])
76 |     return Word2VecLight(model, weights)
77 | 
78 | 
79 | def load_weights(weights_file: Union[str, os.PathLike],
80 |                  weights_format: str) -> Union[np.ndarray, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]:
81 |     weights: np.ndarray = np.load(weights_file, allow_pickle=False)
82 | 
83 |     weights_array_builder = {"csr_matrix": scipy.sparse.csr_matrix,
84 |                             "csc_matrix": scipy.sparse.csc_matrix,
85 |                             "np.ndarray": lambda x: x}
86 |     weights = weights_array_builder[weights_format](weights)
87 | 
88 |     return weights
89 | 


--------------------------------------------------------------------------------
/spec2vec/utils.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | from gensim.models.callbacks import CallbackAny2Vec
 3 | 
 4 | 
 5 | class TrainingProgressLogger(CallbackAny2Vec):
 6 |     """Callback to log training progress."""
 7 | 
 8 |     def __init__(self, num_of_epochs: int):
 9 |         """
10 | 
11 |         Parameters
12 |         ----------
13 |         num_of_epochs:
14 |             Total number of training epochs.
15 |         """
16 |         self.epoch = 0
17 |         self.num_of_epochs = num_of_epochs
18 |         self.loss = 0
19 | 
20 |     def on_epoch_end(self, model):
21 |         """Return progress of model training"""
22 |         loss = model.get_latest_training_loss()
23 | 
24 |         print('\r',
25 |               ' Epoch ' + str(self.epoch+1) + ' of ' + str(self.num_of_epochs) + '.',
26 |               end="")
27 |         print(f'Change in loss after epoch {self.epoch + 1}: {loss - self.loss}')
28 |         self.epoch += 1
29 |         self.loss = loss
30 | 
31 | 
32 | class ModelSaver(CallbackAny2Vec):
33 |     """Callback to save model during training (when specified)."""
34 | 
35 |     def __init__(self, num_of_epochs: int, iterations: List, filename: str):
36 |         """
37 | 
38 |         Parameters
39 |         ----------
40 |         num_of_epochs:
41 |             Total number of training epochs.
42 |         iterations:
43 |             Number of total iterations or list of iterations at which to save the
44 |             model.
45 |         filename:
46 |             Filename to save model.
47 |         """
48 |         self.epoch = 0
49 |         self.num_of_epochs = num_of_epochs
50 |         self.iterations = iterations
51 |         self.filename = filename
52 | 
53 |     def on_epoch_end(self, model):
54 |         """Allow saving model during training when specified in iterations."""
55 |         self.epoch += 1
56 | 
57 |         if self.filename and self.epoch in self.iterations:
58 |             if self.epoch < self.num_of_epochs:
59 |                 filename = f"{self.filename.split('.model')[0]}_iter_{self.epoch}.model"
60 |             else:
61 |                 filename = self.filename
62 |             print("Saving model with name:", filename)
63 |             model.save(filename)
64 | 


--------------------------------------------------------------------------------
/spec2vec/vector_operations.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | from typing import Union
  3 | import numba
  4 | import numpy as np
  5 | from gensim.models.basemodel import BaseTopicModel
  6 | from spec2vec.Document import Document
  7 | 
  8 | 
  9 | logger = logging.getLogger("spec2vec")
 10 | 
 11 | 
 12 | def calc_vector(model: BaseTopicModel, document: Document,
 13 |                 intensity_weighting_power: Union[float, int] = 0,
 14 |                 allowed_missing_percentage: Union[float, int] = 10) -> np.ndarray:
 15 |     """Compute document vector as a (weighted) sum of individual word vectors.
 16 | 
 17 |     Parameters
 18 |     ----------
 19 |     model
 20 |         Pretrained word2vec model to convert words into vectors.
 21 |     document
 22 |         Document containing document.words and document.weights.
 23 |     intensity_weighting_power
 24 |         Specify to what power weights should be raised. The default is 0, which
 25 |         means that no weighing will be done.
 26 |     allowed_missing_percentage:
 27 |         Set the maximum allowed percentage of the document that may be missing
 28 |         from the input model. This is measured as percentage of the weighted, missing
 29 |         words compared to all word vectors of the document. Default is 10, which
 30 |         means up to 10% missing words are allowed. If more words are missing from
 31 |         the model, an empty embedding will be returned (leading to similarities of 0)
 32 |         and a warning is raised.
 33 | 
 34 |     Returns
 35 |     -------
 36 |     vector
 37 |         Vector representing the input document in latent space. Will return None
 38 |         if the missing percentage of the document in the model is > allowed_missing_percentage.
 39 |     """
 40 |     assert max(document.weights) <= 1.0, "Weights are not normalized to unity as expected."
 41 |     assert 0 <= allowed_missing_percentage <= 100.0, "allowed_missing_percentage must be within [0,100]"
 42 | 
 43 |     def _check_model_coverage():
 44 |         """Return True if model covers enough of the document words."""
 45 |         if len(idx_not_in_model) > 0:
 46 |             weights_missing = np.array([document.weights[i] for i in idx_not_in_model])
 47 |             weights_missing_raised = np.power(weights_missing, intensity_weighting_power)
 48 |             missing_percentage = 100 * weights_missing_raised.sum() / (weights_raised.sum()
 49 |                                                                        + weights_missing_raised.sum())
 50 |             msg = (f"Found {len(idx_not_in_model)} word(s) missing in the model.",
 51 |                    f"Weighted missing percentage not covered by the given model is {missing_percentage:.2f}%.")
 52 |             logger.info(msg)
 53 | 
 54 |             if missing_percentage > allowed_missing_percentage:
 55 |                 msg = (f"Missing percentage ({missing_percentage:.2f}%) is above set maximum. An empty vector will be returned.",
 56 |                        "Consider retraining the used model or change the `allowed_missing_percentage`.")
 57 |                 logger.warning(msg)
 58 |                 return False
 59 |         return True
 60 | 
 61 |     idx_not_in_model = [i for i, x in enumerate(document.words) if x not in model.wv.key_to_index]
 62 |     if len(idx_not_in_model) == len(document.words):
 63 |         msg = ("Spectrum without peaks known by the used model. An empty vector will be returned.",
 64 |                "Consider retraining the used model or make sure decimal rounding is correct.")
 65 |         logger.warning(msg)
 66 |         return np.zeros(model.wv.vector_size)
 67 | 
 68 |     words_in_model = [x for i, x in enumerate(document.words) if i not in idx_not_in_model]
 69 |     weights_in_model = np.asarray([x for i, x in enumerate(document.weights)
 70 |                                    if i not in idx_not_in_model]).reshape(len(words_in_model), 1)
 71 | 
 72 |     word_vectors = model.wv[words_in_model]
 73 |     weights_raised = np.power(weights_in_model, intensity_weighting_power)
 74 | 
 75 |     if _check_model_coverage() is True:
 76 |         weights_raised_tiled = np.tile(weights_raised, (1, model.wv.vector_size))
 77 |         return np.sum(word_vectors * weights_raised_tiled, 0)
 78 |     return np.zeros(model.wv.vector_size)
 79 | 
 80 | 
 81 | @numba.njit
 82 | def cosine_similarity_matrix(vectors_1: np.ndarray, vectors_2: np.ndarray) -> np.ndarray:
 83 |     """Fast implementation of cosine similarity between two arrays of vectors.
 84 | 
 85 |     For example:
 86 | 
 87 |     .. code-block:: python
 88 | 
 89 |         import numpy as np
 90 |         from spec2vec.vector_operations import cosine_similarity_matrix
 91 | 
 92 |         vectors_1 = np.array([[1, 1, 0, 0],
 93 |                               [1, 0, 1, 1]])
 94 |         vectors_2 = np.array([[0, 1, 1, 0],
 95 |                               [0, 0, 1, 1]])
 96 |         similarity_matrix = cosine_similarity_matrix(vectors_1, vectors_2)
 97 | 
 98 | 
 99 |     Parameters
100 |     ----------
101 |     vectors_1
102 |         Numpy array of vectors. vectors_1.shape[0] is number of vectors, vectors_1.shape[1]
103 |         is vector dimension.
104 |     vectors_2
105 |         Numpy array of vectors. vectors_2.shape[0] is number of vectors, vectors_2.shape[1]
106 |         is vector dimension.
107 |     """
108 |     assert vectors_1.shape[1] == vectors_2.shape[1], "Input vectors must have same shape."
109 |     vectors_1 = vectors_1.astype(np.float64)  # Numba dot only accepts float or complex arrays
110 |     vectors_2 = vectors_2.astype(np.float64)
111 |     norm_1 = np.sqrt(np.sum(vectors_1**2, axis=1))
112 |     norm_2 = np.sqrt(np.sum(vectors_2**2, axis=1))
113 |     for i in range(vectors_1.shape[0]):
114 |         vectors_1[i] = vectors_1[i] / norm_1[i]
115 |     for i in range(vectors_2.shape[0]):
116 |         vectors_2[i] = vectors_2[i] / norm_2[i]
117 |     return np.dot(vectors_1, vectors_2.T)
118 | 
119 | 
120 | @numba.njit
121 | def cosine_similarity(vector1: np.ndarray, vector2: np.ndarray) -> np.float64:
122 |     """Calculate cosine similarity between two input vectors.
123 | 
124 |     For example:
125 | 
126 |     .. testcode::
127 | 
128 |         import numpy as np
129 |         from spec2vec.vector_operations import cosine_similarity
130 | 
131 |         vector1 = np.array([1, 1, 0, 0])
132 |         vector2 = np.array([1, 1, 1, 1])
133 |         print("Cosine similarity: {:.3f}".format(cosine_similarity(vector1, vector2)))
134 | 
135 |     Should output
136 | 
137 |     .. testoutput::
138 | 
139 |         Cosine similarity: 0.707
140 | 
141 |     Parameters
142 |     ----------
143 |     vector1
144 |         Input vector. Can be array of integers or floats.
145 |     vector2
146 |         Input vector. Can be array of integers or floats.
147 |     """
148 |     assert vector1.shape[0] == vector2.shape[0], "Input vector must have same shape."
149 |     prod12 = 0
150 |     prod11 = 0
151 |     prod22 = 0
152 |     for i in range(vector1.shape[0]):
153 |         prod12 += vector1[i] * vector2[i]
154 |         prod11 += vector1[i] * vector1[i]
155 |         prod22 += vector2[i] * vector2[i]
156 |     cosine_score = 0
157 |     if prod11 != 0 and prod22 != 0:
158 |         cosine_score = prod12 / np.sqrt(prod11 * prod22)
159 |     return np.float64(cosine_score)
160 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import pytest
3 | 
4 | 
5 | @pytest.fixture(scope="module")
6 | def test_dir(request):
7 |     """Return the directory of the currently running test script."""
8 |     return Path(request.fspath).parent
9 | 


--------------------------------------------------------------------------------
/tests/data/weights.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iomega/spec2vec/36553f0e1df589dc02fcb6945fe440ccc2769c69/tests/data/weights.npy


--------------------------------------------------------------------------------
/tests/test_document.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from spec2vec.Document import Document
 3 | 
 4 | 
 5 | def test_document_init():
 6 |     obj = "asdasd"
 7 |     document = Document(obj=obj)
 8 |     assert len(document) == 0
 9 | 
10 | 
11 | def test_document_raises_stop_iteration():
12 |     obj = "asdasd"
13 |     document = Document(obj=obj)
14 |     with pytest.raises(StopIteration):
15 |         next(document)
16 | 


--------------------------------------------------------------------------------
/tests/test_logging.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import logging
 3 | import os
 4 | from spec2vec.logging_functions import (add_logging_to_file,
 5 |                                         reset_spec2vec_logger,
 6 |                                         set_spec2vec_logger_level)
 7 | 
 8 | 
 9 | def test_initial_logging(caplog, capsys):
10 |     """Test logging functionality."""
11 |     reset_spec2vec_logger()
12 |     logger = logging.getLogger("spec2vec")
13 |     logger.info("info test")
14 |     logger.warning("warning test")
15 |     assert logger.name == "spec2vec", "Expected different logger name"
16 |     assert logger.getEffectiveLevel() == 30, "Expected different logging level"
17 |     assert "info test" not in caplog.text, "Info log should not be shown."
18 |     assert "warning test" in caplog.text, "Warning log should have been shown."
19 |     assert "warning test" in capsys.readouterr().out, \
20 |         "Warning log should have been shown to stderr/stdout."
21 |     reset_spec2vec_logger()
22 | 
23 | 
24 | def test_set_and_reset_spec2vec_logger_level(caplog):
25 |     """Test logging functionality."""
26 |     logger = logging.getLogger("spec2vec")
27 |     assert logger.getEffectiveLevel() == 30, "Expected different logging level"
28 | 
29 |     set_spec2vec_logger_level("INFO")
30 |     logger.debug("debug test")
31 |     logger.info("info test")
32 | 
33 |     assert logger.name == "spec2vec", "Expected different logger name"
34 |     assert logger.getEffectiveLevel() == 20, "Expected different logging level"
35 |     assert "debug test" not in caplog.text, "Debug log should not be shown."
36 |     assert "info test" in caplog.text, "Info log should have been shown."
37 | 
38 |     reset_spec2vec_logger()
39 |     assert logger.getEffectiveLevel() == 30, "Expected different logging level"
40 |     reset_spec2vec_logger()
41 | 
42 | 
43 | def test_add_logging_to_file(tmp_path, caplog, capsys):
44 |     """Test writing logs to file."""
45 |     reset_spec2vec_logger()
46 |     set_spec2vec_logger_level("INFO")
47 |     filename = os.path.join(tmp_path, "test.log")
48 |     add_logging_to_file(filename)
49 |     logger = logging.getLogger("spec2vec")
50 |     logger.info("test message no.1")
51 | 
52 |     expected_log_entry = "test message no.1"
53 |     # Test streamed logs
54 |     assert expected_log_entry in caplog.text, "Expected different log message."
55 |     assert expected_log_entry in capsys.readouterr().out, \
56 |         "Expected different log message in output (stdout/stderr)."
57 | 
58 |     # Test log file
59 |     expected_log_entry = "INFO:spec2vec:test_logging:test message no.1"
60 |     assert len(logger.handlers) == 2, "Expected two Handler"
61 |     assert os.path.isfile(filename), "Log file not found."
62 |     with open(filename, "r", encoding="utf-8") as file:
63 |         logs = file.read()
64 |     assert expected_log_entry in logs, "Expected different log file content"
65 |     reset_spec2vec_logger()
66 | 
67 | 
68 | def test_add_logging_to_file_only_file(tmp_path, capsys):
69 |     """Test writing logs to file."""
70 |     reset_spec2vec_logger()
71 |     set_spec2vec_logger_level("INFO")
72 |     filename = os.path.join(tmp_path, "test.log")
73 |     add_logging_to_file(filename, remove_stream_handlers=True)
74 |     logger = logging.getLogger("spec2vec")
75 |     logger.info("test message no.1")
76 | 
77 |     # Test streamed logs
78 |     not_expected_log_entry = "test message no.1"
79 |     assert len(logger.handlers) == 1, "Expected only one Handler"
80 |     assert not_expected_log_entry not in capsys.readouterr().out, "Did not expect log message"
81 | 
82 |     # Test log file
83 |     expected_log_entry = "INFO:spec2vec:test_logging:test message no.1"
84 |     assert os.path.isfile(filename), "Log file not found."
85 |     with open(filename, "r", encoding="utf-8") as file:
86 |         logs = file.read()
87 |     assert expected_log_entry in logs, "Expected different log file content"
88 |     reset_spec2vec_logger()
89 | 


--------------------------------------------------------------------------------
/tests/test_model_building.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import gensim
 3 | import numpy as np
 4 | import pytest
 5 | from matchms import Spectrum
 6 | from spec2vec import SpectrumDocument
 7 | from spec2vec.model_building import (set_learning_rate_decay,
 8 |                                      train_new_word2vec_model)
 9 | 
10 | 
11 | def test_set_learning_rate_decay():
12 |     """Test if correct alpha and min_alpha are calculated."""
13 |     alpha, min_alpha = set_learning_rate_decay(0.5, 0.05, 8)
14 |     assert alpha == 0.5, "Expected different alpha."
15 |     assert min_alpha == 0.5 - 8 * 0.05, "Expected different min_alpha"
16 | 
17 | 
18 | def test_set_learning_rate_decay_rate_too_high():
19 |     """Test if correct alpha and min_alpha are calculated if rate is too high."""
20 |     alpha, min_alpha = set_learning_rate_decay(0.5, 0.05, 20)
21 |     assert alpha == 0.5, "Expected different alpha."
22 |     assert min_alpha == 0.0, "Expected different min_alpha"
23 | 
24 | 
25 | def test_train_new_word2vec_model():
26 |     """Test training of a dummy model."""
27 |     # Create fake corpus
28 |     documents = []
29 |     for i in range(100):
30 |         spectrum = Spectrum(mz=np.linspace(i, 9+i, 10),
31 |                             intensities=np.ones((10)).astype("float"),
32 |                             metadata={})
33 |         documents.append(SpectrumDocument(spectrum, n_decimals=1))
34 |     model = train_new_word2vec_model(documents, iterations=20, vector_size=20,
35 |                                      progress_logger=False)
36 |     assert model.sg == 0, "Expected different default value."
37 |     assert model.negative == 5, "Expected different default value."
38 |     assert model.window == 500, "Expected different default value."
39 |     assert model.alpha == 0.025, "Expected different default value."
40 |     assert model.min_alpha == 0.02, "Expected different default value."
41 |     assert model.epochs == 20, "Expected differnt number of epochs."
42 |     assert model.wv.vector_size == 20, "Expected differnt vector size."
43 |     assert len(model.wv) == 109, "Expected different number of words in vocab."
44 |     assert model.wv.get_vector(documents[0].words[1]).shape[0] == 20, "Expected differnt vector size."
45 | 
46 | 
47 | def test_train_new_word2vec_model_with_logger_and_saving(tmp_path):
48 |     """Test training of a dummy model and save it."""
49 |     # Create fake corpus
50 |     documents = []
51 |     for i in range(100):
52 |         spectrum = Spectrum(mz=np.linspace(i, 9+i, 10),
53 |                             intensities=np.ones((10)).astype("float"),
54 |                             metadata={})
55 |         documents.append(SpectrumDocument(spectrum, n_decimals=1))
56 |     # Train model and write to file
57 |     filename = os.path.join(tmp_path, "test.model")
58 |     model = train_new_word2vec_model(documents, iterations=20, filename=filename,
59 |                                      vector_size=20, progress_logger=True)
60 | 
61 |     # Test if file exists
62 |     assert os.path.isfile(filename), "Could not find saved model file."
63 | 
64 |     # Test if saved model seems to be correct
65 |     model = gensim.models.Word2Vec.load(filename)
66 |     assert model.sg == 0, "Expected different default value."
67 |     assert model.negative == 5, "Expected different default value."
68 |     assert model.window == 500, "Expected different default value."
69 |     assert model.alpha == 0.025, "Expected different default value."
70 |     assert model.min_alpha == 0.02, "Expected different default value."
71 |     assert model.epochs == 20, "Expected differnt number of epochs."
72 |     assert model.wv.vector_size == 20, "Expected differnt vector size."
73 |     assert len(model.wv) == 109, "Expected different number of words in vocab."
74 |     assert model.wv.get_vector(documents[0].words[1]).shape[0] == 20, "Expected differnt vector size."
75 | 
76 | 
77 | def test_train_new_word2vec_model_wrong_entry():
78 |     """Test training of a dummy model with not-accepted gensim argument entry."""
79 |     # Create fake corpus
80 |     documents = []
81 |     for i in range(10):
82 |         spectrum = Spectrum(mz=np.linspace(i, 9+i, 10),
83 |                             intensities=np.ones((10)).astype("float"),
84 |                             metadata={})
85 |         documents.append(SpectrumDocument(spectrum, n_decimals=1))
86 | 
87 |     with pytest.raises(AssertionError) as msg:
88 |         _ = train_new_word2vec_model(documents, iterations=20, alpha=0.01,
89 |                                      progress_logger=False)
90 | 
91 |     expected_message_part = "Expect 'learning_rate_initial' instead of 'alpha'."
92 |     assert expected_message_part in str(msg.value), "Expected particular error message."
93 | 


--------------------------------------------------------------------------------
/tests/test_model_serialization.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from unittest.mock import MagicMock, patch
  3 | import numpy as np
  4 | import pytest
  5 | from gensim.models import Word2Vec
  6 | from matchms import Spectrum, calculate_scores
  7 | from scipy.sparse import coo_matrix, csc_matrix, csr_matrix
  8 | from spec2vec import Spec2Vec
  9 | from spec2vec.serialization import Word2VecLight, export_model, import_model
 10 | 
 11 | 
 12 | @pytest.fixture(params=["numpy", "scipy_csr", "scipy_csc"])
 13 | def model(request, test_dir):
 14 |     model_file = os.path.join(test_dir, "..", "integration-tests", "test_user_workflow_spec2vec.model")
 15 |     model = Word2Vec.load(model_file)
 16 | 
 17 |     if request.param in ["scipy_csc", "scipy_csr"]:
 18 |         scipy_matrix_builder = {"scipy_csr": csr_matrix, "scipy_csc": csc_matrix}
 19 |         model.wv.__numpys, model.wv.__ignoreds = [], []
 20 |         model.wv.__scipys = ["vectors"]  # pylint:disable=protected-access
 21 |         model.wv.vectors = scipy_matrix_builder[request.param](model.wv.vectors)
 22 |     return model
 23 | 
 24 | 
 25 | def write_read_model(model, tmp_path):
 26 |     model_file = tmp_path / "model.json"
 27 |     weights_file = tmp_path / "weights.npy"
 28 |     export_model(model, model_file, weights_file)
 29 | 
 30 |     model = import_model(model_file, weights_file)
 31 |     return model
 32 | 
 33 | 
 34 | def test_write_model_to_disk(model, tmp_path):
 35 |     model_file = tmp_path / "model.json"
 36 |     weights_file = tmp_path / "weights.npy"
 37 |     export_model(model, model_file, weights_file)
 38 | 
 39 |     assert os.path.isfile(model_file)
 40 |     assert os.path.isfile(weights_file)
 41 | 
 42 | 
 43 | def test_read_model_from_disk(test_dir):
 44 |     model_file = os.path.join(test_dir, "data", "model.json")
 45 |     weights_file = os.path.join(test_dir, "data", "weights.npy")
 46 |     model = import_model(model_file, weights_file)
 47 | 
 48 |     assert isinstance(model, Word2VecLight)
 49 | 
 50 | 
 51 | def test_model_metadata_integrity(model, tmp_path):
 52 |     imported_model = write_read_model(model, tmp_path)
 53 | 
 54 |     assert imported_model.wv.vector_size == model.wv.vector_size
 55 |     assert imported_model.wv.key_to_index == model.wv.key_to_index
 56 |     assert imported_model.wv.index_to_key == model.wv.index_to_key
 57 |     assert imported_model.wv.__scipys == model.wv.__scipys  # pylint:disable=protected-access
 58 |     assert imported_model.wv.__numpys == model.wv.__numpys  # pylint:disable=protected-access
 59 |     assert imported_model.wv.__ignoreds == model.wv.__ignoreds  # pylint:disable=protected-access
 60 | 
 61 | 
 62 | @pytest.mark.parametrize("model", ["numpy"], indirect=True)
 63 | def test_dense_weights_integrity(model, tmp_path):
 64 |     imported_model = write_read_model(model, tmp_path)
 65 | 
 66 |     assert (imported_model.wv.vectors == model.wv.vectors).all()
 67 | 
 68 | 
 69 | @pytest.mark.parametrize("model", ["scipy_csr", "scipy_csc"], indirect=True)
 70 | def test_sparse_weights_integrity(model, tmp_path):
 71 |     imported_model = write_read_model(model, tmp_path)
 72 | 
 73 |     assert (imported_model.wv.vectors.toarray() == model.wv.vectors.toarray()).all()
 74 | 
 75 | 
 76 | @patch("json.load", MagicMock(return_value={"unexpected_key": "value", "__weights_format": "np.ndarray"}))
 77 | def test_reading_model_with_wrong_keys_fails(test_dir):
 78 |     model_file = os.path.join(test_dir, "data", "model.json")
 79 |     weights_file = os.path.join(test_dir, "data", "weights.npy")
 80 | 
 81 |     with pytest.raises(ValueError) as error:
 82 |         import_model(model_file, weights_file)
 83 | 
 84 |     assert str(error.value) == "The keys of model's dictionary representation do not match the expected keys."
 85 | 
 86 | 
 87 | def test_writing_model_with_wrong_weights_format_fails(model):
 88 |     model.wv.vectors = coo_matrix(model.wv.vectors)
 89 | 
 90 |     with pytest.raises(NotImplementedError) as error:
 91 |         export_model(model, "model.json", "weights.npy")
 92 | 
 93 |     assert str(error.value) == "The model's weights format is not supported."
 94 | 
 95 | 
 96 | @pytest.mark.parametrize("model", ["numpy"], indirect=True)  # calculate_scores supports only numpy arrays
 97 | def test_reloaded_model_computes_scores(model, tmp_path):
 98 |     spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]),
 99 |                           intensities=np.array([0.7, 0.2, 0.1]),
100 |                           metadata={'id': 'spectrum1'})
101 |     spectrum_2 = Spectrum(mz=np.array([100, 140, 190.]),
102 |                           intensities=np.array([0.4, 0.2, 0.1]),
103 |                           metadata={'id': 'spectrum2'})
104 |     spectrum_3 = Spectrum(mz=np.array([110, 140, 180.]),
105 |                           intensities=np.array([0.4, 0.3, 0.1]),
106 |                           metadata={'id': 'spectrum3'})
107 | 
108 |     queries = [spectrum_1, spectrum_2]
109 |     references = [spectrum_1, spectrum_2, spectrum_3]
110 | 
111 |     reloaded_model = write_read_model(model, tmp_path)
112 |     spec2vec_reloaded = Spec2Vec(reloaded_model, intensity_weighting_power=0.5)
113 |     spec2vec = Spec2Vec(model, intensity_weighting_power=0.5)
114 | 
115 |     scores = list(calculate_scores(references, queries, spec2vec))
116 |     scores_reloaded = list(calculate_scores(references, queries, spec2vec_reloaded))
117 | 
118 |     assert scores == scores_reloaded
119 | 


--------------------------------------------------------------------------------
/tests/test_spec2vec.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import gensim
  3 | import numpy as np
  4 | import pytest
  5 | from matchms import Spectrum
  6 | from spec2vec import Spec2Vec, SpectrumDocument
  7 | 
  8 | 
  9 | def test_spec2vec_pair_method_spectrum_entry():
 10 |     """Test if pair of two Spectrums is handled correctly"""
 11 |     spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]),
 12 |                           intensities=np.array([0.7, 0.2, 0.1]),
 13 |                           metadata={'id': 'spectrum1'})
 14 |     spectrum_2 = Spectrum(mz=np.array([100, 140, 190.]),
 15 |                           intensities=np.array([0.4, 0.2, 0.1]),
 16 |                           metadata={'id': 'spectrum2'})
 17 | 
 18 |     model = load_test_model()
 19 |     spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5)
 20 |     score01 = spec2vec.pair(spectrum_1, spectrum_2)
 21 |     assert score01 == pytest.approx(0.9936808, 1e-6)
 22 |     score11 = spec2vec.pair(spectrum_2, spectrum_2)
 23 |     assert score11 == pytest.approx(1.0, 1e-9)
 24 | 
 25 | 
 26 | def test_spec2vec_pair_method_spectrumdocument_entry():
 27 |     """Test if pair of two SpectrumDocuments is handled correctly"""
 28 |     spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]),
 29 |                           intensities=np.array([0.7, 0.2, 0.1]),
 30 |                           metadata={'id': 'spectrum1'})
 31 |     spectrum_2 = Spectrum(mz=np.array([100, 140, 190.]),
 32 |                           intensities=np.array([0.4, 0.2, 0.1]),
 33 |                           metadata={'id': 'spectrum2'})
 34 | 
 35 |     documents = [SpectrumDocument(s, n_decimals=1) for s in [spectrum_1, spectrum_2]]
 36 |     model = load_test_model()
 37 |     spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5)
 38 |     score01 = spec2vec.pair(documents[0], documents[1])
 39 |     assert score01 == pytest.approx(0.9936808, 1e-6)
 40 |     score11 = spec2vec.pair(documents[1], documents[1])
 41 |     assert score11 == pytest.approx(1.0, 1e-9)
 42 | 
 43 | 
 44 | def test_spec2vec_pair_method_none_entry():
 45 |     """Test if wrong input data raises expected exception"""
 46 |     spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]),
 47 |                           intensities=np.array([0.7, 0.2, 0.1]),
 48 |                           metadata={'id': 'spectrum1'})
 49 |     spectrum_2 = None
 50 |     model = load_test_model()
 51 |     spec2vec = Spec2Vec(model=model)
 52 |     with pytest.raises(ValueError) as msg:
 53 |         _ = spec2vec.pair(spectrum_1, spectrum_2)
 54 | 
 55 |     expected_msg = "Expected input type to be Spectrum or SpectrumDocument"
 56 |     assert expected_msg in str(msg), "Expected different exception"
 57 | 
 58 | 
 59 | def test_spec2vec_pair_method_wrong_spectrumdocument_entry():
 60 |     """Test if SpectrumDocuments with different decimal rounding is handled correctly"""
 61 |     spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]),
 62 |                           intensities=np.array([0.7, 0.2, 0.1]),
 63 |                           metadata={'id': 'spectrum1'})
 64 |     spectrum_2 = Spectrum(mz=np.array([100, 140, 190.]),
 65 |                           intensities=np.array([0.4, 0.2, 0.1]),
 66 |                           metadata={'id': 'spectrum2'})
 67 | 
 68 |     documents = [SpectrumDocument(s, n_decimals=2) for s in [spectrum_1, spectrum_2]]
 69 |     model = load_test_model()
 70 |     spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5)
 71 |     with pytest.raises(AssertionError) as msg:
 72 |         _ = spec2vec.pair(documents[0], documents[1])
 73 | 
 74 |     expected_msg = "Decimal rounding of input data does not agree with model vocabulary."
 75 |     assert expected_msg in str(msg), "Expected different exception"
 76 | 
 77 | 
 78 | @pytest.mark.parametrize("progress_bar", [True, False])
 79 | def test_spec2vec_matrix_method(progress_bar):
 80 |     """Test if matrix of 2x2 SpectrumDocuments is handled correctly.
 81 |     Run with and without progress bar.
 82 |     """
 83 |     spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]),
 84 |                           intensities=np.array([0.7, 0.2, 0.1]),
 85 |                           metadata={'id': 'spectrum1'})
 86 |     spectrum_2 = Spectrum(mz=np.array([100, 140, 190.]),
 87 |                           intensities=np.array([0.4, 0.2, 0.1]),
 88 |                           metadata={'id': 'spectrum2'})
 89 | 
 90 |     documents = [SpectrumDocument(s, n_decimals=1) for s in [spectrum_1, spectrum_2]]
 91 |     model = load_test_model()
 92 |     spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5, progress_bar=progress_bar)
 93 |     scores = spec2vec.matrix(documents, documents)
 94 |     assert scores[0, 0] == pytest.approx(1.0, 1e-9), "Expected different score."
 95 |     assert scores[1, 1] == pytest.approx(1.0, 1e-9), "Expected different score."
 96 |     assert scores[1, 0] == pytest.approx(0.9936808, 1e-6), "Expected different score."
 97 |     assert scores[0, 1] == pytest.approx(0.9936808, 1e-6), "Expected different score."
 98 | 
 99 | 
100 | def test_spec2vec_matrix_method_symmetric_spectrum_entry():
101 |     """Test if matrix of 2x2 Spectrums is handled correctly.
102 |     Run with is_symmetric=True.
103 |     """
104 |     spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]),
105 |                           intensities=np.array([0.7, 0.2, 0.1]),
106 |                           metadata={'id': 'spectrum1'})
107 |     spectrum_2 = Spectrum(mz=np.array([100, 140, 190.]),
108 |                           intensities=np.array([0.4, 0.2, 0.1]),
109 |                           metadata={'id': 'spectrum2'})
110 | 
111 |     spectrums = [spectrum_1, spectrum_2]
112 |     model = load_test_model()
113 |     spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5)
114 |     scores = spec2vec.matrix(spectrums, spectrums, is_symmetric=True)
115 |     assert scores[0, 0] == pytest.approx(1.0, 1e-9), "Expected different score."
116 |     assert scores[1, 1] == pytest.approx(1.0, 1e-9), "Expected different score."
117 |     assert scores[1, 0] == pytest.approx(0.9936808, 1e-6), "Expected different score."
118 |     assert scores[0, 1] == pytest.approx(0.9936808, 1e-6), "Expected different score."
119 | 
120 | 
121 | def test_spec2vec_matrix_method_symmetric_spectrumdocument_entry():
122 |     """Test if matrix of 2x2 SpectrumDocuments is handled correctly.
123 |     Run with is_symmetric=True.
124 |     """
125 |     spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]),
126 |                           intensities=np.array([0.7, 0.2, 0.1]),
127 |                           metadata={'id': 'spectrum1'})
128 |     spectrum_2 = Spectrum(mz=np.array([100, 140, 190.]),
129 |                           intensities=np.array([0.4, 0.2, 0.1]),
130 |                           metadata={'id': 'spectrum2'})
131 | 
132 |     documents = [SpectrumDocument(s, n_decimals=1) for s in [spectrum_1, spectrum_2]]
133 |     model = load_test_model()
134 |     spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5)
135 |     scores = spec2vec.matrix(documents, documents, is_symmetric=True)
136 |     assert scores[0, 0] == pytest.approx(1.0, 1e-9), "Expected different score."
137 |     assert scores[1, 1] == pytest.approx(1.0, 1e-9), "Expected different score."
138 |     assert scores[1, 0] == pytest.approx(0.9936808, 1e-6), "Expected different score."
139 |     assert scores[0, 1] == pytest.approx(0.9936808, 1e-6), "Expected different score."
140 | 
141 | 
142 | def test_spec2vec_matrix_method_symmetric_wrong_entry():
143 |     """Test if matrix of 2x2 SpectrumDocuments is handled correctly.
144 |     Run with is_symmetric=True but non symmetric entries.
145 |     """
146 |     spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]),
147 |                           intensities=np.array([0.7, 0.2, 0.1]),
148 |                           metadata={'id': 'spectrum1'})
149 |     spectrum_2 = Spectrum(mz=np.array([100, 140, 190.]),
150 |                           intensities=np.array([0.4, 0.2, 0.1]),
151 |                           metadata={'id': 'spectrum2'})
152 | 
153 |     documents1 = [SpectrumDocument(s, n_decimals=1) for s in [spectrum_1, spectrum_2]]
154 |     documents2 = [SpectrumDocument(s, n_decimals=1) for s in [spectrum_2, spectrum_1]]
155 |     model = load_test_model()
156 |     spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5)
157 |     expected_msg = "Expected references to be equal to queries for is_symmetric=True"
158 |     with pytest.raises(AssertionError) as msg:
159 |         _ = spec2vec.matrix(documents1, documents2, is_symmetric=True)
160 |     assert expected_msg in str(msg), "Expected different exception message"
161 | 
162 | 
163 | def load_test_model():
164 |     """Load pretrained Word2Vec model."""
165 |     repository_root = os.path.join(os.path.dirname(__file__), "..")
166 |     model_file = os.path.join(repository_root, "integration-tests", "test_user_workflow_spec2vec.model")
167 |     assert os.path.isfile(model_file), "Expected file not found."
168 |     return gensim.models.Word2Vec.load(model_file)
169 | 


--------------------------------------------------------------------------------
/tests/test_spectrum_document.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pytest
  3 | from matchms import Spectrum
  4 | from matchms.filtering import add_losses
  5 | from spec2vec import SpectrumDocument
  6 | 
  7 | 
  8 | def test_spectrum_document_init_n_decimals_default_value_no_losses():
  9 | 
 10 |     mz = np.array([10, 20, 30, 40], dtype="float")
 11 |     intensities = np.array([0, 0.01, 0.1, 1], dtype="float")
 12 |     metadata = dict(precursor_mz=100.0)
 13 |     spectrum = Spectrum(mz=mz, intensities=intensities, metadata=metadata)
 14 |     spectrum_document = SpectrumDocument(spectrum)
 15 | 
 16 |     assert spectrum_document.n_decimals == 2, "Expected different default for n_decimals"
 17 |     assert len(spectrum_document) == 4
 18 |     assert spectrum_document.words == [
 19 |         "peak@10.00", "peak@20.00", "peak@30.00", "peak@40.00"
 20 |     ]
 21 |     assert next(spectrum_document) == "peak@10.00"
 22 | 
 23 | 
 24 | def test_spectrum_document_init_n_decimals_1_no_losses():
 25 |     mz = np.array([10, 20, 30, 40], dtype="float")
 26 |     intensities = np.array([0, 0.01, 0.1, 1], dtype="float")
 27 |     metadata = dict(precursor_mz=100.0)
 28 |     spectrum = Spectrum(mz=mz, intensities=intensities, metadata=metadata)
 29 |     spectrum_document = SpectrumDocument(spectrum, n_decimals=1)
 30 | 
 31 |     assert spectrum_document.n_decimals == 1
 32 |     assert len(spectrum_document) == 4
 33 |     assert spectrum_document.words == [
 34 |         "peak@10.0", "peak@20.0", "peak@30.0", "peak@40.0"
 35 |     ]
 36 |     assert next(spectrum_document) == "peak@10.0"
 37 | 
 38 | 
 39 | def test_spectrum_document_init_default_with_losses():
 40 |     """Use default n_decimal and add losses."""
 41 |     mz = np.array([10, 20, 30, 40], dtype="float")
 42 |     intensities = np.array([0, 0.01, 0.1, 1], dtype="float")
 43 |     metadata = dict(precursor_mz=100.0)
 44 |     spectrum_in = Spectrum(mz=mz, intensities=intensities, metadata=metadata)
 45 |     spectrum = add_losses(spectrum_in)
 46 |     spectrum_document = SpectrumDocument(spectrum)
 47 | 
 48 |     assert spectrum_document.n_decimals == 2, "Expected different default for n_decimals"
 49 |     assert len(spectrum_document) == 8
 50 |     assert spectrum_document.words == [
 51 |         "peak@10.00", "peak@20.00", "peak@30.00", "peak@40.00",
 52 |         "loss@60.00", "loss@70.00", "loss@80.00", "loss@90.00"
 53 |     ]
 54 |     assert next(spectrum_document) == "peak@10.00"
 55 | 
 56 | 
 57 | def test_spectrum_document_init_n_decimals_1():
 58 |     """Use n_decimal=1 and add losses."""
 59 |     mz = np.array([10, 20, 30, 40], dtype="float")
 60 |     intensities = np.array([0, 0.01, 0.1, 1], dtype="float")
 61 |     metadata = dict(precursor_mz=100.0)
 62 |     spectrum_in = Spectrum(mz=mz, intensities=intensities, metadata=metadata)
 63 |     spectrum = add_losses(spectrum_in)
 64 |     spectrum_document = SpectrumDocument(spectrum, n_decimals=1)
 65 | 
 66 |     assert spectrum_document.n_decimals == 1
 67 |     assert len(spectrum_document) == 8
 68 |     assert spectrum_document.words == [
 69 |         "peak@10.0", "peak@20.0", "peak@30.0", "peak@40.0",
 70 |         "loss@60.0", "loss@70.0", "loss@80.0", "loss@90.0"
 71 |     ]
 72 |     assert next(spectrum_document) == "peak@10.0"
 73 | 
 74 | 
 75 | def test_spectrum_document_metadata_getter():
 76 |     """Test metadata getter"""
 77 |     mz = np.array([10, 20, 30, 40], dtype="float")
 78 |     intensities = np.array([0, 0.01, 0.1, 1], dtype="float")
 79 |     metadata = {"precursor_mz": 100.0,
 80 |                 "smiles": "testsmiles"}
 81 |     spectrum_in = Spectrum(mz=mz, intensities=intensities, metadata=metadata)
 82 |     spectrum_document = SpectrumDocument(spectrum_in, n_decimals=2)
 83 | 
 84 |     assert spectrum_document.n_decimals == 2
 85 |     assert len(spectrum_document) == 4
 86 |     assert spectrum_document.metadata == metadata, "Expected different metadata"
 87 |     assert spectrum_document.get("smiles") == "testsmiles", "Expected different metadata"
 88 |     assert spectrum_document.words == [
 89 |         "peak@10.00", "peak@20.00", "peak@30.00", "peak@40.00"
 90 |     ]
 91 |     assert next(spectrum_document) == "peak@10.00"
 92 | 
 93 | 
 94 | def test_spectrum_document_metadata_getter_notallowed_key():
 95 |     """Test metadata getter with key that is also attribute"""
 96 |     mz = np.array([10], dtype="float")
 97 |     intensities = np.array([0], dtype="float")
 98 |     metadata = {"smiles": "testsmiles"}
 99 |     spectrum_in = Spectrum(mz=mz, intensities=intensities, metadata=metadata)
100 |     spectrum_document = SpectrumDocument(spectrum_in, n_decimals=2)
101 | 
102 |     assert spectrum_document.n_decimals == 2
103 |     with pytest.raises(AssertionError) as msg:
104 |         spectrum_document.get("n_decimals")
105 | 
106 |     assert str(msg.value) == "Key cannot be attribute of SpectrumDocument class"
107 | 
108 | 
109 | def test_spectrum_document_peak_getter():
110 |     """Test peak getter"""
111 |     mz = np.array([10, 20, 30, 40], dtype="float")
112 |     intensities = np.array([0, 0.01, 0.1, 1], dtype="float")
113 |     metadata = {"precursor_mz": 100.0}
114 |     spectrum_in = Spectrum(mz=mz, intensities=intensities, metadata=metadata)
115 |     spectrum_document = SpectrumDocument(spectrum_in, n_decimals=2)
116 | 
117 |     assert spectrum_document.words == [
118 |         "peak@10.00", "peak@20.00", "peak@30.00", "peak@40.00"
119 |     ]
120 |     assert np.all(spectrum_document.peaks.mz == mz), "Expected different peak m/z"
121 |     assert np.all(spectrum_document.peaks.intensities == intensities), "Expected different peaks"
122 | 
123 | 
124 | def test_spectrum_document_losses_getter():
125 |     """Test losses getter"""
126 |     mz = np.array([10, 20, 30, 40], dtype="float")
127 |     intensities = np.array([0, 0.01, 0.1, 1], dtype="float")
128 |     metadata = {"precursor_mz": 100.0}
129 |     spectrum_in = Spectrum(mz=mz, intensities=intensities, metadata=metadata)
130 |     spectrum = add_losses(spectrum_in)
131 |     spectrum_document = SpectrumDocument(spectrum, n_decimals=2)
132 |     assert np.all(spectrum_document.losses.mz == np.array([60., 70., 80., 90.])), \
133 |         "Expected different losses"
134 |     assert np.all(spectrum_document.losses.intensities == intensities[::-1]), \
135 |         "Expected different losses"
136 | 


--------------------------------------------------------------------------------
/tests/test_vector_operations.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import gensim
  3 | import numpy as np
  4 | import pytest
  5 | from matchms import Spectrum
  6 | from spec2vec import SpectrumDocument
  7 | from spec2vec.logging_functions import (reset_spec2vec_logger,
  8 |                                         set_spec2vec_logger_level)
  9 | from spec2vec.vector_operations import (calc_vector, cosine_similarity,
 10 |                                         cosine_similarity_matrix)
 11 | 
 12 | 
 13 | def test_calc_vector():
 14 |     """Test deriving a document vector using a pretrained network."""
 15 |     spectrum = Spectrum(mz=np.array([100, 150, 200, 250], dtype="float"),
 16 |                         intensities=np.array([0.1, 0.1, 0.1, 1.0], dtype="float"),
 17 |                         metadata={})
 18 | 
 19 |     document = SpectrumDocument(spectrum, n_decimals=1)
 20 |     model = import_pretrained_model()
 21 |     vector = calc_vector(model, document, intensity_weighting_power=0.5, allowed_missing_percentage=1.0)
 22 |     expected_vector = np.array([0.08982063, -1.43037023, -0.17572929, -0.45750666, 0.44942236,
 23 |                                 1.35530729, -1.8305029, -0.36850534, -0.28393048, -0.34192028])
 24 |     assert np.all(vector == pytest.approx(expected_vector, 1e-5)), "Expected different document vector."
 25 | 
 26 | 
 27 | def test_calc_vector_missing_words_logging(caplog):
 28 |     """Test using a pretrained network and a missing words."""
 29 |     set_spec2vec_logger_level("INFO")
 30 |     spectrum = Spectrum(mz=np.array([11.1, 100, 200, 250], dtype="float"),
 31 |                         intensities=np.array([0.1, 0.1, 0.1, 1.0], dtype="float"),
 32 |                         metadata={})
 33 | 
 34 |     document = SpectrumDocument(spectrum, n_decimals=1)
 35 |     model = import_pretrained_model()
 36 |     assert document.words[0] not in model.wv.key_to_index, "Expected word to be missing from given model."
 37 | 
 38 |     calc_vector(model, document, intensity_weighting_power=0.5,
 39 |                 allowed_missing_percentage=100.0)
 40 | 
 41 |     expected_msg1 = "spec2vec:vector_operations.py"
 42 |     expected_msg2 = "Found 1 word(s) missing in the model."
 43 |     assert expected_msg1 in caplog.text, "Expected particular warning message."
 44 |     assert expected_msg2 in caplog.text, "Expected particular warning message."
 45 |     reset_spec2vec_logger()
 46 | 
 47 | 
 48 | def test_calc_vector_higher_than_allowed_missing_percentage(caplog):
 49 |     """Test using a pretrained network and a missing word percentage above allowed."""
 50 |     spectrum = Spectrum(mz=np.array([11.1, 100, 200, 250], dtype="float"),
 51 |                         intensities=np.array([0.1, 0.1, 0.1, 1.0], dtype="float"),
 52 |                         metadata={})
 53 | 
 54 |     document = SpectrumDocument(spectrum, n_decimals=1)
 55 |     model = import_pretrained_model()
 56 |     assert document.words[0] not in model.wv.key_to_index, "Expected word to be missing from given model."
 57 | 
 58 |     calc_vector(model, document, intensity_weighting_power=0.5, allowed_missing_percentage=16.0)
 59 | 
 60 |     expected_message_part = "Missing percentage (16.23%) is above set maximum."
 61 |     assert expected_message_part in caplog.text, "Expected particular warning message."
 62 | 
 63 | 
 64 | def test_calc_vector_within_allowed_missing_percentage():
 65 |     """Test using a pretrained network and a missing word percentage within allowed."""
 66 |     spectrum = Spectrum(mz=np.array([11.1, 100, 200, 250], dtype="float"),
 67 |                         intensities=np.array([0.1, 0.1, 0.1, 1.0], dtype="float"),
 68 |                         metadata={})
 69 | 
 70 |     document = SpectrumDocument(spectrum, n_decimals=1)
 71 |     model = import_pretrained_model()
 72 |     vector = calc_vector(model, document, intensity_weighting_power=0.5, allowed_missing_percentage=17.0)
 73 |     expected_vector = np.array([0.12775915, -1.17673617, -0.14598507, -0.40189132, 0.36908966,
 74 |                                 1.11608575, -1.46774333, -0.31442554, -0.23168877, -0.29420064])
 75 |     assert document.words[0] not in model.wv.key_to_index, "Expected word to be missing from given model."
 76 |     assert np.all(vector == pytest.approx(expected_vector, 1e-5)), "Expected different document vector."
 77 | 
 78 | 
 79 | def test_calc_vector_no_words_in_model(caplog):
 80 |     """Test using a pretrained network which covers no 'word' of a given spectrum."""
 81 |     spectrum = Spectrum(mz=np.array([11.0, 100.8, 200.8], dtype="float"),
 82 |                         intensities=np.array([0.1, 0.2, 1.0], dtype="float"),
 83 |                         metadata={})
 84 | 
 85 |     document = SpectrumDocument(spectrum, n_decimals=1)
 86 |     model = import_pretrained_model()
 87 |     for i in range(3):
 88 |         assert document.words[i] not in model.wv.key_to_index, \
 89 |             "Expected word to be missing from given model."
 90 | 
 91 |     vector = calc_vector(model, document, intensity_weighting_power=0.5)
 92 | 
 93 |     expected_message_part = "An empty vector will be returned."
 94 |     assert expected_message_part in caplog.text, "Expected particular warning message."
 95 |     assert np.all(vector == np.zeros(10)), "Expected empty vector"
 96 | 
 97 | 
 98 | def import_pretrained_model():
 99 |     """Helper function to import pretrained word2vec model."""
100 |     repository_root = os.path.join(os.path.dirname(__file__), "..")
101 |     model_file = os.path.join(repository_root, "integration-tests", "test_user_workflow_spec2vec.model")
102 |     return gensim.models.Word2Vec.load(model_file)
103 | 
104 | 
105 | @pytest.mark.parametrize("numba_compiled", [True, False])
106 | def test_cosine_similarity(numba_compiled):
107 |     """Test cosine similarity score calculation."""
108 |     vector1 = np.array([1, 1, 0, 0])
109 |     vector2 = np.array([1, 1, 1, 1])
110 | 
111 |     if numba_compiled:
112 |         score11 = cosine_similarity(vector1, vector1)
113 |         score12 = cosine_similarity(vector1, vector2)
114 |         score22 = cosine_similarity(vector2, vector2)
115 |     else:
116 |         score11 = cosine_similarity.py_func(vector1, vector1)
117 |         score12 = cosine_similarity.py_func(vector1, vector2)
118 |         score22 = cosine_similarity.py_func(vector2, vector2)
119 | 
120 |     assert score12 == 2 / np.sqrt(2 * 4), "Expected different score."
121 |     assert score11 == score22 == 1.0, "Expected different score."
122 | 
123 | 
124 | @pytest.mark.parametrize("numba_compiled", [True, False])
125 | def test_cosine_similarity_all_zeros(numba_compiled):
126 |     """Test cosine similarity score calculation with empty vector."""
127 |     vector1 = np.array([0, 0, 0, 0])
128 |     vector2 = np.array([1, 1, 1, 1])
129 | 
130 |     if numba_compiled:
131 |         score11 = cosine_similarity(vector1, vector1)
132 |         score12 = cosine_similarity(vector1, vector2)
133 |         score22 = cosine_similarity(vector2, vector2)
134 |     else:
135 |         score11 = cosine_similarity.py_func(vector1, vector1)
136 |         score12 = cosine_similarity.py_func(vector1, vector2)
137 |         score22 = cosine_similarity.py_func(vector2, vector2)
138 | 
139 |     assert score11 == score12 == 0.0, "Expected different score."
140 |     assert score22 == 1.0, "Expected different score."
141 | 
142 | 
143 | @pytest.mark.parametrize("numba_compiled", [True, False])
144 | def test_cosine_similarity_matrix(numba_compiled):
145 |     """Test cosine similarity scores calculation using int32 input.."""
146 |     vectors1 = np.array([[1, 1, 0, 0],
147 |                          [1, 0, 1, 1]], dtype=np.int32)
148 |     vectors2 = np.array([[0, 1, 1, 0],
149 |                          [0, 0, 1, 1]], dtype=np.int32)
150 | 
151 |     if numba_compiled:
152 |         scores = cosine_similarity_matrix(vectors1, vectors2)
153 |     else:
154 |         scores = cosine_similarity_matrix.py_func(vectors1, vectors2)
155 |     expected_scores = np.array([[0.5, 0.],
156 |                                 [0.40824829, 0.81649658]])
157 |     assert scores == pytest.approx(expected_scores, 1e-7), "Expected different scores."
158 | 
159 | 
160 | @pytest.mark.parametrize("numba_compiled", [True, False])
161 | def test_cosine_similarity_floats_matrix(numba_compiled):
162 |     """Test cosine similarity scores calculation using float64 input.."""
163 |     vectors1 = np.array([[1, 1, 0, 0],
164 |                          [1, 0, 1, 1]], dtype=np.float64)
165 |     vectors2 = np.array([[0, 1, 1, 0],
166 |                          [0, 0, 1, 1]], dtype=np.float64)
167 | 
168 |     if numba_compiled:
169 |         scores = cosine_similarity_matrix(vectors1, vectors2)
170 |     else:
171 |         scores = cosine_similarity_matrix.py_func(vectors1, vectors2)
172 |     expected_scores = np.array([[0.5, 0.],
173 |                                 [0.40824829, 0.81649658]])
174 |     assert scores == pytest.approx(expected_scores, 1e-7), "Expected different scores."
175 | 
176 | 
177 | @pytest.mark.parametrize("numba_compiled", [True, False])
178 | def test_cosine_similarity_matrix_input_cloned(numba_compiled):
179 |     """Test if score implementation clones the input correctly."""
180 |     vectors1 = np.array([[2, 2, 0, 0],
181 |                          [2, 0, 2, 2]])
182 |     vectors2 = np.array([[0, 2, 2, 0],
183 |                          [0, 0, 2, 2]])
184 | 
185 |     if numba_compiled:
186 |         cosine_similarity_matrix(vectors1, vectors2)
187 |     else:
188 |         cosine_similarity_matrix.py_func(vectors1, vectors2)
189 | 
190 |     assert np.all(vectors1 == np.array([[2, 2, 0, 0],
191 |                                         [2, 0, 2, 2]])), "Expected unchanged input."
192 | 
193 | 
194 | def test_differnt_input_vector_lengths():
195 |     """Test if correct error is raised."""
196 |     vector1 = np.array([0, 0, 0, 0])
197 |     vector2 = np.array([1, 1, 1, 1, 1])
198 | 
199 |     with pytest.raises(AssertionError) as msg:
200 |         _ = cosine_similarity(vector1, vector2)
201 | 
202 |     expected_message = "Input vector must have same shape."
203 |     assert expected_message == str(msg.value), "Expected particular error message."
204 | 


--------------------------------------------------------------------------------
/tests/test_version_string_consistency.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import re
 3 | from spec2vec import __version__ as expected_version
 4 | 
 5 | 
 6 | def test_version_string_consistency():
 7 |     """Check whether version in conda/meta.yaml is consistent with that in spec2vec.__version__"""
 8 | 
 9 |     repository_root = os.path.join(os.path.dirname(__file__), "..")
10 |     fixture = os.path.join(repository_root, "conda", "meta.yaml")
11 | 
12 |     with open(fixture, "r", encoding="utf-8") as f:
13 |         metayaml_contents = f.read()
14 | 
15 |     match = re.search(r"^{% set version = \"(?P<semver>.*)\" %}$", metayaml_contents, re.MULTILINE)
16 |     actual_version = match["semver"]
17 | 
18 |     assert expected_version == actual_version, "Expected version string used in conda/meta.yaml to be consistent with" \
19 |                                                " that in spec2vec.__version__"
20 | 


--------------------------------------------------------------------------------