├── .github
│   └── workflows
│       ├── python-package.yml
│       └── python-publish.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── images
│   ├── NN.drawio.png
│   ├── avg_concat.png
│   └── full_concat.png
├── notebooks
│   ├── example.ipynb
│   ├── example.qmd
│   ├── torchFastText_config.json
│   └── utils.py
├── pyproject.toml
├── renovate.json
├── requirements.txt
├── tests
│   ├── __init__.py
│   ├── test_all.py
│   ├── test_fasttext_model_dataset.py
│   └── test_ngramtokenizer.py
└── torchFastText
    ├── __init__.py
    ├── datasets
    │   ├── __init__.py
    │   ├── dataset.py
    │   └── tokenizer.py
    ├── explainability
    │   ├── __init__.py
    │   └── visualisation.py
    ├── model
    │   ├── __init__.py
    │   ├── lightning_module.py
    │   ├── losses.py
    │   └── pytorch_model.py
    ├── preprocess
    │   ├── __init__.py
    │   └── preprocess.py
    ├── torchFastText.py
    └── utilities
        ├── __init__.py
        ├── checkers.py
        └── utils.py
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
3 |
4 | name: Python package
5 |
6 | on:
7 | push:
8 | branches: [ "main" ]
9 | pull_request:
10 | branches: [ "main" ]
11 |
12 | jobs:
13 | build:
14 |
15 | runs-on: ubuntu-latest
16 | strategy:
17 | fail-fast: false
18 | matrix:
19 | python-version: [ "3.10", "3.11", "3.12"]
20 |
21 | steps:
22 | - uses: actions/checkout@v4
23 | - name: Set up Python ${{ matrix.python-version }}
24 | uses: actions/setup-python@v5
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 | - name: Install dependencies
28 | run: |
29 | python -m pip install --upgrade pip
30 | python -m pip install flake8 pytest
31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
32 | - name: Lint with flake8
33 | run: |
34 | # stop the build if there are Python syntax errors or undefined names
35 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
36 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
37 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
38 | - name: Test with pytest
39 | run: |
40 | pytest
41 |
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package to PyPI when a release is created
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3 |
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 |
9 | name: Upload Python Package
10 |
11 | on:
12 | release:
13 | types: [published]
14 |
15 | permissions:
16 | contents: read
17 |
18 | jobs:
19 | release-build:
20 | runs-on: ubuntu-latest
21 |
22 | steps:
23 | - uses: actions/checkout@v4
24 |
25 | - uses: actions/setup-python@v5
26 | with:
27 | python-version: "3.x"
28 |
29 | - name: Build release distributions
30 | run: |
31 | # NOTE: put your own distribution build steps here.
32 | python -m pip install build poetry
33 | poetry install
34 | poetry build
35 |
36 | - name: Upload distributions
37 | uses: actions/upload-artifact@v4
38 | with:
39 | name: release-dists
40 | path: dist/
41 |
42 | pypi-publish:
43 | runs-on: ubuntu-latest
44 | needs:
45 | - release-build
46 | permissions:
47 | # IMPORTANT: this permission is mandatory for trusted publishing
48 | id-token: write
49 |
50 | # Dedicated environments with protections for publishing are strongly recommended.
51 | # For more information, see: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#deployment-protection-rules
52 | environment:
53 | name: pypi
54 | # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status:
55 | # url: https://pypi.org/p/YOURPROJECT
56 | #
57 | # ALTERNATIVE: if your GitHub Release name is the PyPI project version string
58 | # ALTERNATIVE: exactly, uncomment the following line instead:
59 | # url: https://pypi.org/project/YOURPROJECT/${{ github.event.release.name }}
60 |
61 | steps:
62 | - name: Retrieve release distributions
63 | uses: actions/download-artifact@v4
64 | with:
65 | name: release-dists
66 | path: dist/
67 |
68 | - name: Publish release distributions to PyPI
69 | uses: pypa/gh-action-pypi-publish@release/v1
70 | with:
71 | packages-dir: dist/
72 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
162 | # Lightning logs
163 | lightning_logs/
164 |
165 | # Training data
166 | data/training_data.txt
167 |
168 | # Docs
169 | docs/
170 | fastTextAttention.py
171 | *.pth
172 |
173 | # No lock file
174 | poetry.lock
175 |
176 | # vscode
177 | .vscode/
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # See https://pre-commit.com for more information
2 | # See https://pre-commit.com/hooks.html for more hooks
3 |
4 | repos:
5 | - repo: https://github.com/astral-sh/ruff-pre-commit
6 | # Ruff version.
7 | rev: v0.7.1
8 | hooks:
9 | # Run the linter.
10 | - id: ruff
11 | types_or: [ python ]
12 | args: [ --fix ]
13 |
14 | # Run import sorting (the ruff hook already invokes `ruff check`,
15 | # so only the options are passed here).
16 | - id: ruff
17 | args: [ "--select", "I", "--fix" ]
17 |
18 | # Run the formatter.
19 | - id: ruff-format
20 | types_or: [ python ]
21 | - repo: https://github.com/kynan/nbstripout
22 | rev: 0.7.1
23 | hooks:
24 | - id: nbstripout
25 | - repo: https://github.com/pre-commit/pre-commit-hooks
26 | rev: v5.0.0
27 | hooks:
28 | - id: trailing-whitespace
29 | - id: end-of-file-fixer
30 | - id: check-yaml
31 | - id: check-added-large-files
32 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 InseeFrLab
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # torchFastText: Efficient text classification with PyTorch
2 |
3 | A flexible PyTorch implementation of FastText for text classification with support for categorical features.
4 |
5 | ## Features
6 |
7 | - Supports text classification with FastText architecture
8 | - Handles both text and categorical features
9 | - N-gram tokenization
10 | - Flexible optimizer and scheduler options
11 | - GPU and CPU support
12 | - Model checkpointing and early stopping
13 | - Prediction and model explanation capabilities
14 |
15 | ## Installation
16 |
17 | ```bash
18 | pip install torchFastText
19 | ```
20 |
21 | ## Key Components
22 |
23 | - `build()`: Constructs the FastText model architecture
24 | - `train()`: Trains the model with built-in callbacks and logging
25 | - `predict()`: Generates class predictions
26 | - `predict_and_explain()`: Provides predictions with feature attributions (see the sketch below)
27 |
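Once a model has been trained (see Quick Start below), a call returning predictions together with attributions looks like this (a sketch based on the example notebook; `model` and `X` are as in Quick Start):

```python
# Top-k predictions plus word-level and letter-level attribution scores
pred, conf, all_scores, all_scores_letters = model.predict_and_explain(X)
```
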
28 | ## Subpackages
29 |
30 | - `preprocess`: To preprocess text input, using the `nltk` and `unidecode` libraries.
31 | - `explainability`: Simple methods to visualize feature attributions at word and letter levels, using the `captum` library.
32 |
33 | Run `pip install torchFastText[preprocess]` or `pip install torchFastText[explainability]` to install these optional dependencies; a short usage sketch follows.
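For instance, the preprocessing helper can be used along these lines (a sketch based on the example notebook; the column name is arbitrary):

```python
import pandas as pd

from torchFastText.preprocess import clean_text_feature

# Clean a text column before training
df = pd.DataFrame({"libelle": ["Boulangerie et pâtisserie artisanale"]})
df["libelle_processed"] = clean_text_feature(df["libelle"])
```
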
34 |
35 |
36 | ## Quick Start
37 |
38 | ```python
39 | from torchFastText import torchFastText
40 |
41 | # Initialize the model
42 | model = torchFastText(
43 | num_tokens=1000000,
44 | embedding_dim=100,
45 | min_count=5,
46 | min_n=3,
47 | max_n=6,
48 | len_word_ngrams=3,
49 | sparse=True
50 | )
51 |
52 | # Train the model
53 | model.train(
54 | X_train=train_data,
55 | y_train=train_labels,
56 | X_val=val_data,
57 | y_val=val_labels,
58 | num_epochs=10,
59 | batch_size=64,
60 | lr=4e-3
61 | )
62 | # Make predictions
63 | predictions = model.predict(test_data)
64 | ```
65 |
66 | where `train_data` is an array of shape $(N, d)$ whose first column contains the text as strings, the remaining columns containing the encoded categorical variables in `int` format.
67 |
68 | Please make sure each possible label appears at least once in `y_train`.
69 |
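For illustration, a toy `train_data` with one text column and two categorical columns could look like this (a sketch; all values are made up):

```python
import numpy as np

# First column: raw text (str); remaining columns: label-encoded categories (int)
train_data = np.array(
    [
        ["boulangerie patisserie", 0, 1],
        ["coiffure a domicile", 1, 0],
        ["reparation de velos", 2, 1],
    ],
    dtype=object,
)
train_labels = np.array([0, 1, 2])  # every possible label appears at least once
```
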
70 | ## Dependencies
71 |
72 | - PyTorch Lightning
73 | - NumPy
74 |
75 | ## Categorical features
76 |
77 | If present, each categorical feature $i$ is associated with an embedding matrix of size (number of unique values, embedding dimension), where the latter is a user-chosen hyperparameter (`categorical_embedding_dims`) that can take three types of values:
78 |
79 | - `None`: same embedding dimension as the token embedding matrix. The categorical embeddings are then added to the sentence-level embedding (itself an average of the token embeddings). See [Figure 1](#Default-architecture).
80 | - `int`: all categorical embeddings share this same dimension; they are averaged and the resulting vector is concatenated to the sentence-level embedding (the last linear layer has an adapted input size). See [Figure 2](#avg-architecture).
81 | - `list`: the categorical embeddings may have different dimensions; all of them are concatenated, without aggregation, to the sentence-level embedding (the last linear layer has an adapted input size). See [Figure 3](#concat-architecture).
82 |
83 | Default is `None`. The three options are sketched below.
84 |
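A minimal sketch of the three configurations (parameter values are arbitrary):

```python
from torchFastText import torchFastText

common = dict(
    num_tokens=100_000, embedding_dim=50, min_count=1,
    min_n=3, max_n=6, len_word_ngrams=3, sparse=False,
)

# Figure 1, 'sum': categorical embeddings reuse the token embedding dimension
model_sum = torchFastText(categorical_embedding_dims=None, **common)

# Figure 2, 'average and concatenate': one shared dimension for all categorical features
model_avg = torchFastText(categorical_embedding_dims=10, **common)

# Figure 3, 'concatenate all': one dimension per categorical feature
model_concat = torchFastText(categorical_embedding_dims=[10, 5, 8], **common)
```
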
85 |
86 | 
87 | *Figure 1: The 'sum' architecture*
88 |
89 |
90 | 
91 | *Figure 2: The 'average and concatenate' architecture*
92 |
93 |
94 | 
95 | *Figure 3: The 'concatenate all' architecture*
96 |
97 | ## Documentation
98 |
99 | For detailed usage and examples, please refer to the [example notebook](notebooks/example.ipynb). Use `pip install -r requirements.txt` after cloning the repository to install the necessary dependencies (some are specific to the notebook).
100 |
101 | ## Contributing
102 |
103 | Contributions are welcome! Please feel free to submit a Pull Request.
104 |
105 | ## License
106 |
107 | MIT
108 |
109 |
110 | ## References
111 |
112 | Inspired by the original FastText paper [1] and implementation.
113 |
114 | [1] A. Joulin, E. Grave, P. Bojanowski, T. Mikolov, [*Bag of Tricks for Efficient Text Classification*](https://arxiv.org/abs/1607.01759)
115 |
116 | ```
117 | @InProceedings{joulin2017bag,
118 | title={Bag of Tricks for Efficient Text Classification},
119 | author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Mikolov, Tomas},
120 | booktitle={Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 2, Short Papers},
121 | month={April},
122 | year={2017},
123 | publisher={Association for Computational Linguistics},
124 | pages={427--431},
125 | }
126 | ```
127 |
--------------------------------------------------------------------------------
/images/NN.drawio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InseeFrLab/torch-fastText/6b9aeb770033af311c2558799b07d14720f42f94/images/NN.drawio.png
--------------------------------------------------------------------------------
/images/avg_concat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InseeFrLab/torch-fastText/6b9aeb770033af311c2558799b07d14720f42f94/images/avg_concat.png
--------------------------------------------------------------------------------
/images/full_concat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InseeFrLab/torch-fastText/6b9aeb770033af311c2558799b07d14720f42f94/images/full_concat.png
--------------------------------------------------------------------------------
/notebooks/example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Exemple d’utilisation de la librairie `TorchFastText`\n",
8 | "\n",
9 | "*Warning*\n",
10 | "\n",
11 | "*`TorchFastText` library is still under active development. Have a\n",
12 | "regular look to for\n",
13 | "latest information.*\n",
14 | "\n",
15 | "To install package, you can run the following snippet"
16 | ],
17 | "id": "a01b1526-51df-4bf9-9fd4-11ef22ffcc79"
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 1,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "# Stable version\n",
26 | "pip install torchFastText \n",
27 | "# Development version\n",
28 | "# pip install !https://github.com/InseeFrLab/torch-fastText.git"
29 | ],
30 | "id": "a00a2856"
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {},
35 | "source": [
36 | "# Load and preprocess data\n",
37 | "\n",
38 | "In that guide, we propose to illustrate main package functionalities\n",
39 | "using that `DataFrame`:"
40 | ],
41 | "id": "b292ea76-57a1-4d4e-9bde-dcc9656dc447"
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 2,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "import pandas as pd\n",
50 | "df = pd.read_parquet(\"https://minio.lab.sspcloud.fr/projet-ape/extractions/20241027_sirene4.parquet\")\n",
51 | "df = df.sample(10000)"
52 | ],
53 | "id": "37c042fe"
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {},
58 | "source": [
59 | "Our goal will be to build multilabel classification for the `code`\n",
60 | "variable using `libelle` as feature.\n",
61 | "\n",
62 | "## Enriching our test dataset\n",
63 | "\n",
64 | "Unlike `Fasttext`, this package offers the possibility of having several\n",
65 | "feature columns of different types (string for the text column and\n",
66 | "additional variables in numeric form, for example). To illustrate that,\n",
67 | "we propose the following enrichment of the example dataset:"
68 | ],
69 | "id": "c399b4b0-a9cb-450e-9a5e-480e0e657b8e"
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 3,
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "import pandas as pd\n",
78 | "import numpy as np\n",
79 | "from sklearn.model_selection import train_test_split\n",
80 | "from sklearn.preprocessing import LabelEncoder\n",
81 | "\n",
82 | "def categorize_surface(\n",
83 | " df: pd.DataFrame, surface_feature_name: int, like_sirene_3: bool = True\n",
84 | ") -> pd.DataFrame:\n",
85 | " \"\"\"\n",
86 | " Categorize the surface of the activity.\n",
87 | "\n",
88 | " Args:\n",
89 | " df (pd.DataFrame): DataFrame to categorize.\n",
90 | " surface_feature_name (str): Name of the surface feature.\n",
91 | " like_sirene_3 (bool): If True, categorize like Sirene 3.\n",
92 | "\n",
93 | " Returns:\n",
94 | " pd.DataFrame: DataFrame with a new column \"surf_cat\".\n",
95 | " \"\"\"\n",
96 | " df_copy = df.copy()\n",
97 | " df_copy[surface_feature_name] = df_copy[surface_feature_name].replace(\"nan\", np.nan)\n",
98 | " df_copy[surface_feature_name] = df_copy[surface_feature_name].astype(float)\n",
99 | " # Check surface feature exists\n",
100 | " if surface_feature_name not in df.columns:\n",
101 | " raise ValueError(f\"Surface feature {surface_feature_name} not found in DataFrame.\")\n",
102 | " # Check surface feature is a float variable\n",
103 | " if not (pd.api.types.is_float_dtype(df_copy[surface_feature_name])):\n",
104 | " raise ValueError(f\"Surface feature {surface_feature_name} must be a float variable.\")\n",
105 | "\n",
106 | " if like_sirene_3:\n",
107 | " # Categorize the surface\n",
108 | " df_copy[\"surf_cat\"] = pd.cut(\n",
109 | " df_copy[surface_feature_name],\n",
110 | " bins=[0, 120, 400, 2500, np.inf],\n",
111 | " labels=[\"1\", \"2\", \"3\", \"4\"],\n",
112 | " ).astype(str)\n",
113 | " else:\n",
114 | " # Log transform the surface\n",
115 | " df_copy[\"surf_log\"] = np.log(df[surface_feature_name])\n",
116 | "\n",
117 | " # Categorize the surface\n",
118 | " df_copy[\"surf_cat\"] = pd.cut(\n",
119 | " df_copy.surf_log,\n",
120 | " bins=[0, 3, 4, 5, 12],\n",
121 | " labels=[\"1\", \"2\", \"3\", \"4\"],\n",
122 | " ).astype(str)\n",
123 | "\n",
124 | " df_copy[surface_feature_name] = df_copy[\"surf_cat\"].replace(\"nan\", \"0\")\n",
125 | " df_copy[surface_feature_name] = df_copy[surface_feature_name].astype(int)\n",
126 | " df_copy = df_copy.drop(columns=[\"surf_log\", \"surf_cat\"], errors=\"ignore\")\n",
127 | " return df_copy\n",
128 | "\n",
129 | "\n",
130 | "def clean_and_tokenize_df(\n",
131 | " df,\n",
132 | " categorical_features=[\"EVT\", \"CJ\", \"NAT\", \"TYP\", \"CRT\"],\n",
133 | " text_feature=\"libelle_processed\",\n",
134 | " label_col=\"apet_finale\",\n",
135 | "):\n",
136 | " df.fillna(\"nan\", inplace=True)\n",
137 | "\n",
138 | " df = df.rename(\n",
139 | " columns={\n",
140 | " \"evenement_type\": \"EVT\",\n",
141 | " \"cj\": \"CJ\",\n",
142 | " \"activ_nat_et\": \"NAT\",\n",
143 | " \"liasse_type\": \"TYP\",\n",
144 | " \"activ_surf_et\": \"SRF\",\n",
145 | " \"activ_perm_et\": \"CRT\",\n",
146 | " }\n",
147 | " )\n",
148 | "\n",
149 | " les = []\n",
150 | " for col in categorical_features:\n",
151 | " le = LabelEncoder()\n",
152 | " df[col] = le.fit_transform(df[col])\n",
153 | " les.append(le)\n",
154 | "\n",
155 | " df = categorize_surface(df, \"SRF\", like_sirene_3=True)\n",
156 | " df = df[[text_feature, \"EVT\", \"CJ\", \"NAT\", \"TYP\", \"SRF\", \"CRT\", label_col]]\n",
157 | "\n",
158 | " return df, les\n",
159 | "\n",
160 | "\n",
161 | "def stratified_split_rare_labels(X, y, test_size=0.2, min_train_samples=1):\n",
162 | " # Get unique labels and their frequencies\n",
163 | " unique_labels, label_counts = np.unique(y, return_counts=True)\n",
164 | "\n",
165 | " # Separate rare and common labels\n",
166 | " rare_labels = unique_labels[label_counts == 1]\n",
167 | "\n",
168 | " # Create initial mask for rare labels to go into training set\n",
169 | " rare_label_mask = np.isin(y, rare_labels)\n",
170 | "\n",
171 | " # Separate data into rare and common label datasets\n",
172 | " X_rare = X[rare_label_mask]\n",
173 | " y_rare = y[rare_label_mask]\n",
174 | " X_common = X[~rare_label_mask]\n",
175 | " y_common = y[~rare_label_mask]\n",
176 | "\n",
177 | " # Split common labels stratified\n",
178 | " X_common_train, X_common_test, y_common_train, y_common_test = train_test_split(\n",
179 | " X_common, y_common, test_size=test_size, stratify=y_common\n",
180 | " )\n",
181 | "\n",
182 | " # Combine rare labels with common labels split\n",
183 | " X_train = np.concatenate([X_rare, X_common_train])\n",
184 | " y_train = np.concatenate([y_rare, y_common_train])\n",
185 | " X_test = X_common_test\n",
186 | " y_test = y_common_test\n",
187 | "\n",
188 | " return X_train, X_test, y_train, y_test\n",
189 | "\n",
190 | "def add_libelles(\n",
191 | " df: pd.DataFrame,\n",
192 | " df_naf: pd.DataFrame,\n",
193 | " y: str,\n",
194 | " text_feature: str,\n",
195 | " textual_features: list,\n",
196 | " categorical_features: list,\n",
197 | "):\n",
198 | " missing_codes = set(df_naf[\"code\"])\n",
199 | " fake_obs = df_naf[df_naf[\"code\"].isin(missing_codes)]\n",
200 | " fake_obs[y] = fake_obs[\"code\"]\n",
201 | " fake_obs[text_feature] = fake_obs[[text_feature]].apply(\n",
202 | " lambda row: \" \".join(f\"[{col}] {val}\" for col, val in row.items() if val != \"\"), axis=1\n",
203 | " )\n",
204 | " df = pd.concat([df, fake_obs[[col for col in fake_obs.columns if col in df.columns]]])\n",
205 | "\n",
206 | " if textual_features is not None:\n",
207 | " for feature in textual_features:\n",
208 | " df[feature] = df[feature].fillna(value=\"\")\n",
209 | " if categorical_features is not None:\n",
210 | " for feature in categorical_features:\n",
211 | " df[feature] = df[feature].fillna(value=\"NaN\")\n",
212 | "\n",
213 | " print(f\"\\t*** {len(missing_codes)} codes have been added in the database...\\n\")\n",
214 | " return df"
215 | ],
216 | "id": "92402df7"
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 4,
221 | "metadata": {},
222 | "outputs": [
223 | {
224 | "output_type": "stream",
225 | "name": "stdout",
226 | "text": [
227 | " *** 732 codes have been added in the database...\n"
228 | ]
229 | }
230 | ],
231 | "source": [
232 | "categorical_features = [\"evenement_type\", \"cj\", \"activ_nat_et\", \"liasse_type\", \"activ_surf_et\", \"activ_perm_et\"]\n",
233 | "text_feature = \"libelle\"\n",
234 | "y = \"apet_finale\"\n",
235 | "textual_features = None\n",
236 | "\n",
237 | "naf2008 = pd.read_csv(\"https://minio.lab.sspcloud.fr/projet-ape/data/naf2008.csv\", sep=\";\")\n",
238 | "df = add_libelles(df, naf2008, y, text_feature, textual_features, categorical_features)"
239 | ],
240 | "id": "1fd02895"
241 | },
242 | {
243 | "cell_type": "markdown",
244 | "metadata": {},
245 | "source": [
246 | "## Preprocessing\n",
247 | "\n",
248 | "To reduce noise in text fields, we recommend pre-processing before\n",
249 | "training a model with our package. We assume this preprocessing is\n",
250 | "handled by the package user : this gives him the opportunity to control\n",
251 | "data cleansing.\n",
252 | "\n",
253 | "Here’s an example of the type of preprocessing that can be carried out\n",
254 | "before moving on to the modeling phase"
255 | ],
256 | "id": "67f4160d-0c98-4700-80f4-1ba454e6a2df"
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": 5,
261 | "metadata": {},
262 | "outputs": [],
263 | "source": [
264 | "from torchFastText.preprocess import clean_text_feature\n",
265 | "df[\"libelle_processed\"] = clean_text_feature(df[\"libelle\"])"
266 | ],
267 | "id": "61b0252e"
268 | },
269 | {
270 | "cell_type": "markdown",
271 | "metadata": {},
272 | "source": [
273 | "Right now, the model requires the label (variable y) to be a numerical\n",
274 | "variable. If the label variable is a text variable, we recommend using\n",
275 | "Scikit Learn’s\n",
276 | "[LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html)\n",
277 | "to convert into a numeric variable. Using that function will give user\n",
278 | "the possibility to get back labels from the encoder after running\n",
279 | "predictions."
280 | ],
281 | "id": "acde2929-fe92-4107-8066-a5c8ac5d6428"
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": 6,
286 | "metadata": {},
287 | "outputs": [],
288 | "source": [
289 | "encoder = LabelEncoder()\n",
290 | "df[\"apet_finale\"] = encoder.fit_transform(df[\"apet_finale\"])"
291 | ],
292 | "id": "8c02a833"
293 | },
294 | {
295 | "cell_type": "markdown",
296 | "metadata": {},
297 | "source": [
298 | "The function `clean_and_tokenize_df` requires special `DataFrame`\n",
299 | "formatting:\n",
300 | "\n",
301 | "- First column contains the processed text (str)\n",
302 | "- Next ones contain the “encoded” categorical (discrete) variables in\n",
303 | " int format"
304 | ],
305 | "id": "25593e1a-1661-49e3-9734-272ec4745de1"
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": 7,
310 | "metadata": {},
311 | "outputs": [
312 | {
313 | "output_type": "stream",
314 | "name": "stderr",
315 | "text": [
316 | "/tmp/ipykernel_90631/2075507147.py:60: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.\n",
317 | " df.fillna(\"nan\", inplace=True)"
318 | ]
319 | }
320 | ],
321 | "source": [
322 | "df, _ = clean_and_tokenize_df(df, text_feature=\"libelle_processed\")\n",
323 | "X = df[[\"libelle_processed\", \"EVT\", \"CJ\", \"NAT\", \"TYP\", \"CRT\", \"SRF\"]].values\n",
324 | "y = df[\"apet_finale\"].values"
325 | ],
326 | "id": "5fb5b0c7"
327 | },
328 | {
329 | "cell_type": "markdown",
330 | "metadata": {},
331 | "source": [
332 | "## Splitting in train-test sets\n",
333 | "\n",
334 | "As usual in a learning approach, you need to break down your data into\n",
335 | "learning and test/validation samples to obtain robust performance\n",
336 | "statistics.\n",
337 | "\n",
338 | "This work is the responsibility of the package’s users. Here’s an\n",
339 | "example of how to do it, using the\n",
340 | "[`train_test_split`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)\n",
341 | "function in `Scikit`."
342 | ],
343 | "id": "e70de831-dbc9-49be-b0c4-d70dd6479d03"
344 | },
345 | {
346 | "cell_type": "code",
347 | "execution_count": 8,
348 | "metadata": {},
349 | "outputs": [],
350 | "source": [
351 | "from sklearn.model_selection import train_test_split \n",
352 | "X_train, X_test, y_train, y_test = train_test_split(X, y)"
353 | ],
354 | "id": "b593fd75"
355 | },
356 | {
357 | "cell_type": "markdown",
358 | "metadata": {},
359 | "source": [
360 | "# Build the torch-fastText model (without training it)\n",
361 | "\n",
362 | "There are several ways to define and train a pytorch.fasttext model in\n",
363 | "this package.\n",
364 | "\n",
365 | "We first show how to initialize the model and then afterwars build it.\n",
366 | "\n",
367 | "`torchFastText` function accepts the following parameters:\n",
368 | "\n",
369 | "| Parameter | Meaning | Example Value |\n",
370 | "|---------------------|------------------------------------------|----------|\n",
371 | "| `num_tokens` | Number of rows in the embedding matrix (size of the vocabulary) | 100000 |\n",
372 | "| `embedding_dim` | Dimension of the embedding (number of columns in the matrix) | 50 |\n",
373 | "| `sparse` | Use sparse embedding for fast computation (PyTorch) | False |\n",
374 | "| `categorical_embedding_dims` | Dimension of the embedding for categorical features | 10 |\n",
375 | "| `min_count` | Minimum occurrences of a word in the corpus to be included | 1 |\n",
376 | "| `min_n` | Minimum length of character n-grams | 3 |\n",
377 | "| `max_n` | Maximum length of character n-grams | 6 |\n",
378 | "| `len_word_ngrams` | Length of word n-grams | 3 |"
379 | ],
380 | "id": "8729c5f4-9038-4437-929b-fc500dc0db7a"
381 | },
382 | {
383 | "cell_type": "code",
384 | "execution_count": 9,
385 | "metadata": {},
386 | "outputs": [],
387 | "source": [
388 | "from torchFastText import torchFastText\n",
389 | "\n",
390 | "parameters = {\n",
391 | " \"num_tokens\": 100000,\n",
392 | " \"embedding_dim\": 50,\n",
393 | " \"sparse\": False,\n",
394 | " \"categorical_embedding_dims\": 10,\n",
395 | " \"min_count\": 1,\n",
396 | " \"min_n\": 3,\n",
397 | " \"max_n\": 6,\n",
398 | " \"len_word_ngrams\": 3,\n",
399 | "}\n",
400 | "\n",
401 | "parameters_train = {\n",
402 | " \"lr\": 0.004,\n",
403 | " \"num_epochs\": 1,\n",
404 | " \"batch_size\": 256,\n",
405 | " \"patience\": 3 \n",
406 | "}\n",
407 | "\n",
408 | "model = torchFastText(**parameters)"
409 | ],
410 | "id": "5879ca88"
411 | },
412 | {
413 | "cell_type": "markdown",
414 | "metadata": {},
415 | "source": [
416 | "`model` is then a special `torchFastText` object:"
417 | ],
418 | "id": "05f9d26b-f08f-41be-93e4-b55a2c86690c"
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": 10,
423 | "metadata": {},
424 | "outputs": [
425 | {
426 | "output_type": "display_data",
427 | "metadata": {},
428 | "data": {
429 | "text/plain": [
430 | "torchFastText.torchFastText.torchFastText"
431 | ]
432 | }
433 | }
434 | ],
435 | "source": [
436 | "type(model)"
437 | ],
438 | "id": "ebf5608b"
439 | },
440 | {
441 | "cell_type": "markdown",
442 | "metadata": {},
443 | "source": [
444 | "As any `PyTorch` model, it accepts being save as a JSON for later on\n",
445 | "use:"
446 | ],
447 | "id": "dcbe8289-f506-48f9-b854-96f25974368f"
448 | },
449 | {
450 | "cell_type": "code",
451 | "execution_count": 11,
452 | "metadata": {},
453 | "outputs": [],
454 | "source": [
455 | "model.to_json('torchFastText_config.json')\n",
456 | "# model = torchFastText.from_json('torchFastText_config.json')"
457 | ],
458 | "id": "6c3b2b85"
459 | },
460 | {
461 | "cell_type": "markdown",
462 | "metadata": {},
463 | "source": [
464 | "We can apply `build` to finally train our model. These are the\n",
465 | "parameters accepted by the `build` method\n",
466 | "\n",
467 | "| Parameter | Meaning | Example Value |\n",
468 | "|---------------------|------------------------------------------|----------|\n",
469 | "| `lr` | Learning rate | 0.004 |\n",
470 | "| `num_epochs` | Number of training epochs | 1 |\n",
471 | "| `batch_size` | Batch size for training | 256 |\n",
472 | "| `patience` | Early stopping patience (number of epochs without improvement) | 3 |\n",
473 | "\n",
474 | "We build the model using the training data. We have now access to the\n",
475 | "tokenizer, the PyTorch model as well as a PyTorch Lightning module ready\n",
476 | "to be trained. Note that Lightning is high-level framework for PyTorch\n",
477 | "that simplifies the process of training, validating, and deploying\n",
478 | "machine learning models."
479 | ],
480 | "id": "5f8b017f-66a1-413d-85e8-1981adf64823"
481 | },
482 | {
483 | "cell_type": "code",
484 | "execution_count": 12,
485 | "metadata": {},
486 | "outputs": [
487 | {
488 | "output_type": "stream",
489 | "name": "stderr",
490 | "text": [
491 | "2025-03-05 16:27:41 - torchFastText.model.pytorch_model - num_rows is different from the number of tokens in the tokenizer. Using provided num_rows.\n",
492 | "2025-03-05 16:27:41 - torchFastText.torchFastText - No scheduler parameters provided. Using default parameters (suited for ReduceLROnPlateau)."
493 | ]
494 | }
495 | ],
496 | "source": [
497 | "model.build(X_train, y_train, lightning=True, lr=parameters_train.get(\"lr\"))"
498 | ],
499 | "id": "e2e43d0e"
500 | },
501 | {
502 | "cell_type": "markdown",
503 | "metadata": {},
504 | "source": [
505 | "One can retrieve different objects from `model` instance:\n",
506 | "\n",
507 | "- `model.pytorch_model`\n",
508 | "- `model.tokenizer`\n",
509 | "- `model.lightning_module`"
510 | ],
511 | "id": "b5a7d5fa-596a-470b-892e-e8fafdb8221a"
512 | },
513 | {
514 | "cell_type": "code",
515 | "execution_count": 13,
516 | "metadata": {},
517 | "outputs": [
518 | {
519 | "output_type": "display_data",
520 | "metadata": {},
521 | "data": {
522 | "text/plain": [
523 | "FastTextModel(\n",
524 | " (embeddings): EmbeddingBag(107992, 50, mode='mean', padding_idx=107991)\n",
525 | " (emb_0): Embedding(24, 10)\n",
526 | " (emb_1): Embedding(40, 10)\n",
527 | " (emb_2): Embedding(8, 10)\n",
528 | " (emb_3): Embedding(13, 10)\n",
529 | " (emb_4): Embedding(3, 10)\n",
530 | " (emb_5): Embedding(4, 10)\n",
531 | " (fc): Linear(in_features=60, out_features=646, bias=True)\n",
532 | ")"
533 | ]
534 | }
535 | }
536 | ],
537 | "source": [
538 | "model.pytorch_model"
539 | ],
540 | "id": "091024e6"
541 | },
542 | {
543 | "cell_type": "code",
544 | "execution_count": 14,
545 | "metadata": {},
546 | "outputs": [
547 | {
548 | "output_type": "display_data",
549 | "metadata": {},
550 | "data": {
551 | "text/plain": [
552 | ""
553 | ]
554 | }
555 | }
556 | ],
557 | "source": [
558 | "model.tokenizer"
559 | ],
560 | "id": "d983b113"
561 | },
562 | {
563 | "cell_type": "code",
564 | "execution_count": 15,
565 | "metadata": {},
566 | "outputs": [
567 | {
568 | "output_type": "display_data",
569 | "metadata": {},
570 | "data": {
571 | "text/plain": [
572 | "FastTextModule(\n",
573 | " (model): FastTextModel(\n",
574 | " (embeddings): EmbeddingBag(107992, 50, mode='mean', padding_idx=107991)\n",
575 | " (emb_0): Embedding(24, 10)\n",
576 | " (emb_1): Embedding(40, 10)\n",
577 | " (emb_2): Embedding(8, 10)\n",
578 | " (emb_3): Embedding(13, 10)\n",
579 | " (emb_4): Embedding(3, 10)\n",
580 | " (emb_5): Embedding(4, 10)\n",
581 | " (fc): Linear(in_features=60, out_features=646, bias=True)\n",
582 | " )\n",
583 | " (loss): CrossEntropyLoss()\n",
584 | " (accuracy_fn): MulticlassAccuracy()\n",
585 | ")"
586 | ]
587 | }
588 | }
589 | ],
590 | "source": [
591 | "model.lightning_module"
592 | ],
593 | "id": "9b23f1ba"
594 | },
595 | {
596 | "cell_type": "markdown",
597 | "metadata": {},
598 | "source": [
599 | "One can also retrieve more precise information regarding the tokenizer.\n",
600 | "This can be useful to know how text is parsed before being given to the\n",
601 | "neural network:"
602 | ],
603 | "id": "b804391a-979a-4a74-a5f7-d8e27550e20e"
604 | },
605 | {
606 | "cell_type": "code",
607 | "execution_count": 16,
608 | "metadata": {},
609 | "outputs": [
610 | {
611 | "output_type": "stream",
612 | "name": "stdout",
613 | "text": [
614 | "{0: '',\n",
615 | " 8097: 'lorem ipsum dolor',\n",
616 | " 8172: '',\n",
618 | " 8949: '',\n",
620 | " 15121: 'lorem>',\n",
621 | " 17369: 'ame',\n",
622 | " 18928: '',\n",
625 | " 21355: '',\n",
633 | " 33381: 'or>',\n",
634 | " 35841: 'ipsum dolor',\n",
635 | " 37380: '',\n",
642 | " 44394: '',\n",
644 | " 45738: 'sit>',\n",
645 | " 45871: 'ipsu',\n",
646 | " 48778: 'psu',\n",
647 | " 48931: 'orem>',\n",
648 | " 49786: '',\n",
651 | " 57345: 'it>',\n",
652 | " 57990: 'olor>',\n",
653 | " 60515: 'lor>',\n",
654 | " 60522: 'ore',\n",
655 | " 62809: 'sum>',\n",
656 | " 65472: 'met',\n",
657 | " 65559: '',\n",
659 | " 67778: 'olor',\n",
660 | " 67985: 'orem',\n",
661 | " 68529: 'psum',\n",
662 | " 69320: '',\n",
663 | " 72158: 'rem',\n",
664 | " 73818: 'ipsum>',\n",
665 | " 74637: 'dolor sit',\n",
666 | " 76593: 'lor',\n",
667 | " 77594: '',\n",
670 | " 87627: '',\n",
672 | " 92771: 'psum>',\n",
673 | " 92809: '', '', 'H '],\n",
818 | " ['', '', 'e '],\n",
819 | " ['', '', 'l '],\n",
820 | " ['', '', 'l '],\n",
821 | " ['', '', 'o '],\n",
822 | " [''],\n",
823 | " ['', '', 'w '],\n",
824 | " ['', '', 'o '],\n",
825 | " ['', '', 'r '],\n",
826 | " ['', '', 'l '],\n",
827 | " ['', '', 'd ']],\n",
828 | " [tensor([40876, 0, 51965]),\n",
829 | " tensor([51907, 0, 77296]),\n",
830 | " tensor([74312, 0, 26137]),\n",
831 | " tensor([74312, 0, 26137]),\n",
832 | " tensor([ 9853, 0, 53786]),\n",
833 | " tensor([0]),\n",
834 | " tensor([29925, 0, 74978]),\n",
835 | " tensor([ 9853, 0, 53786]),\n",
836 | " tensor([ 8646, 0, 13223]),\n",
837 | " tensor([74312, 0, 26137]),\n",
838 | " tensor([ 89472, 0, 104945])],\n",
839 | " [{40876: '', 0: '', 51965: 'H '},\n",
840 | " {51907: '', 0: '', 77296: 'e '},\n",
841 | " {74312: '', 0: '', 26137: 'l '},\n",
842 | " {74312: '', 0: '', 26137: 'l '},\n",
843 | " {9853: '', 0: '', 53786: 'o '},\n",
844 | " {0: ''},\n",
845 | " {29925: '', 0: '', 74978: 'w '},\n",
846 | " {9853: '', 0: '', 53786: 'o '},\n",
847 | " {8646: '', 0: '', 13223: 'r '},\n",
848 | " {74312: '', 0: '', 26137: 'l '},\n",
849 | " {89472: '', 0: '', 104945: 'd '}],\n",
850 | " [{'': 40876, '': 0, 'H ': 51965},\n",
851 | " {'': 51907, '': 0, 'e ': 77296},\n",
852 | " {'': 74312, '': 0, 'l ': 26137},\n",
853 | " {'': 74312, '': 0, 'l ': 26137},\n",
854 | " {'': 9853, '': 0, 'o ': 53786},\n",
855 | " {'': 0},\n",
856 | " {'': 29925, '': 0, 'w ': 74978},\n",
857 | " {'': 9853, '': 0, 'o ': 53786},\n",
858 | " {'': 8646, '': 0, 'r ': 13223},\n",
859 | " {'': 74312, '': 0, 'l ': 26137},\n",
860 | " {'': 89472, '': 0, 'd ': 104945}])"
861 | ]
862 | }
863 | }
864 | ],
865 | "source": [
866 | "tokenizer.tokenize(\"Hello world\")"
867 | ],
868 | "id": "0b4964f3"
869 | },
870 | {
871 | "cell_type": "markdown",
872 | "metadata": {},
873 | "source": [
874 | "However, there is a more straightforward way to do: creating directly\n",
875 | "the `NGramTokenizer` instance:"
876 | ],
877 | "id": "fd5b6899-7831-40a6-9841-bbc1b0804956"
878 | },
879 | {
880 | "cell_type": "code",
881 | "execution_count": 23,
882 | "metadata": {},
883 | "outputs": [],
884 | "source": [
885 | "tokenizer = NGramTokenizer(\n",
886 | " **parameters,\n",
887 | " training_text=training_text\n",
888 | " )"
889 | ],
890 | "id": "8a6ee96b"
891 | },
892 | {
893 | "cell_type": "code",
894 | "execution_count": 24,
895 | "metadata": {},
896 | "outputs": [
897 | {
898 | "output_type": "display_data",
899 | "metadata": {},
900 | "data": {
901 | "text/plain": [
902 | "([['', '', 'H '],\n",
903 | " ['', '', 'e '],\n",
904 | " ['', '', 'l '],\n",
905 | " ['', '', 'l '],\n",
906 | " ['', '', 'o '],\n",
907 | " [''],\n",
908 | " ['', '', 'w '],\n",
909 | " ['', '', 'o '],\n",
910 | " ['', '', 'r '],\n",
911 | " ['', '', 'l '],\n",
912 | " ['', '', 'd ']],\n",
913 | " [tensor([40876, 0, 51965]),\n",
914 | " tensor([51907, 0, 77296]),\n",
915 | " tensor([74312, 0, 26137]),\n",
916 | " tensor([74312, 0, 26137]),\n",
917 | " tensor([ 9853, 0, 53786]),\n",
918 | " tensor([0]),\n",
919 | " tensor([29925, 0, 74978]),\n",
920 | " tensor([ 9853, 0, 53786]),\n",
921 | " tensor([ 8646, 0, 13223]),\n",
922 | " tensor([74312, 0, 26137]),\n",
923 | " tensor([ 89472, 0, 104945])],\n",
924 | " [{40876: '', 0: '', 51965: 'H '},\n",
925 | " {51907: '', 0: '', 77296: 'e '},\n",
926 | " {74312: '', 0: '', 26137: 'l '},\n",
927 | " {74312: '', 0: '', 26137: 'l '},\n",
928 | " {9853: '', 0: '', 53786: 'o '},\n",
929 | " {0: ''},\n",
930 | " {29925: '', 0: '', 74978: 'w '},\n",
931 | " {9853: '', 0: '', 53786: 'o '},\n",
932 | " {8646: '', 0: '', 13223: 'r '},\n",
933 | " {74312: '', 0: '', 26137: 'l '},\n",
934 | " {89472: '', 0: '', 104945: 'd '}],\n",
935 | " [{'': 40876, '': 0, 'H ': 51965},\n",
936 | " {'': 51907, '': 0, 'e ': 77296},\n",
937 | " {'': 74312, '': 0, 'l ': 26137},\n",
938 | " {'': 74312, '': 0, 'l ': 26137},\n",
939 | " {'': 9853, '': 0, 'o ': 53786},\n",
940 | " {'': 0},\n",
941 | " {'': 29925, '': 0, 'w ': 74978},\n",
942 | " {'': 9853, '': 0, 'o ': 53786},\n",
943 | " {'': 8646, '': 0, 'r ': 13223},\n",
944 | " {'': 74312, '': 0, 'l ': 26137},\n",
945 | " {'': 89472, '': 0, 'd ': 104945}])"
946 | ]
947 | }
948 | }
949 | ],
950 | "source": [
951 | "tokenizer.tokenize(\"Hello world\")"
952 | ],
953 | "id": "776636e6"
954 | },
955 | {
956 | "cell_type": "markdown",
957 | "metadata": {},
958 | "source": [
959 | "Why creating a `NGramTokenizer` separately ? Because model constructor\n",
960 | "is now independent from training data:"
961 | ],
962 | "id": "6b0fd6c0-9740-4a32-9bb2-4a3cfe174ea8"
963 | },
964 | {
965 | "cell_type": "code",
966 | "execution_count": 26,
967 | "metadata": {},
968 | "outputs": [
969 | {
970 | "output_type": "stream",
971 | "name": "stderr",
972 | "text": [
973 | "2025-03-05 16:27:41 - torchFastText.model.pytorch_model - num_rows is different from the number of tokens in the tokenizer. Using provided num_rows.\n",
974 | "2025-03-05 16:27:42 - torchFastText.torchFastText - No scheduler parameters provided. Using default parameters (suited for ReduceLROnPlateau)."
975 | ]
976 | }
977 | ],
978 | "source": [
979 | "model = torchFastText.build_from_tokenizer(\n",
980 | " tokenizer, \n",
981 | " embedding_dim=parameters[\"embedding_dim\"], \n",
982 | " categorical_embedding_dims=parameters[\"categorical_embedding_dims\"], \n",
983 | " sparse=parameters[\"sparse\"], \n",
984 | " lr=parameters_train[\"lr\"], \n",
985 | " num_classes=NUM_CLASSES, \n",
986 | " num_categorical_features=NUM_CAT_VAR, \n",
987 | " categorical_vocabulary_sizes=CAT_VOCAB_SIZE\n",
988 | ")"
989 | ],
990 | "id": "ee5dbe0b"
991 | },
992 | {
993 | "cell_type": "markdown",
994 | "metadata": {},
995 | "source": [
996 | "**Warning**:\n",
997 | "\n",
998 | "If the PyTorch model building did not use the training data, please keep\n",
999 | "in mind that its architecture (that you customize here) should match the\n",
1000 | "vocabulary size of the categorical variables and the total number of\n",
1001 | "class, otherwise the model will raise an error during training.\n",
1002 | "\n",
1003 | "# Train a torchFastText model directly\n",
1004 | "\n",
1005 | "If no advanced customization or PyTorch tuning is necessary, there is a\n",
1006 | "direct way of training model."
1007 | ],
1008 | "id": "f53080e9-9d78-479f-a446-2feb4a92b1de"
1009 | },
1010 | {
1011 | "cell_type": "code",
1012 | "execution_count": 27,
1013 | "metadata": {},
1014 | "outputs": [],
1015 | "source": [
1016 | "model.train(\n",
1017 | " X_train,\n",
1018 | " y_train,\n",
1019 | " X_test,\n",
1020 | " y_test,\n",
1021 | " num_epochs=parameters_train['num_epochs'],\n",
1022 | " batch_size=parameters_train['batch_size'],\n",
1023 | " patience_scheduler=parameters_train['patience'],\n",
1024 | " patience_train=parameters_train['patience'],\n",
1025 | " lr=parameters_train['lr'],\n",
1026 | " verbose = True\n",
1027 | ")"
1028 | ],
1029 | "id": "ce5dc4a1"
1030 | },
1031 | {
1032 | "cell_type": "markdown",
1033 | "metadata": {},
1034 | "source": [
1035 | "# Load a trained model from a Lightning checkpoint\n",
1036 | "\n",
1037 | "/! TOCOMPLETE"
1038 | ],
1039 | "id": "919b67ed-4a65-4c26-92a9-771a4be3cd15"
1040 | },
1041 | {
1042 | "cell_type": "code",
1043 | "execution_count": 28,
1044 | "metadata": {},
1045 | "outputs": [],
1046 | "source": [
1047 | "model.load_from_checkpoint(model.best_model_path) # or any other checkpoint path (string)"
1048 | ],
1049 | "id": "f560047b"
1050 | },
1051 | {
1052 | "cell_type": "markdown",
1053 | "metadata": {},
1054 | "source": [
1055 | "# Predicting from new labels"
1056 | ],
1057 | "id": "e521a23b-77c4-4b0c-9940-a17b19b8111d"
1058 | },
1059 | {
1060 | "cell_type": "code",
1061 | "execution_count": 29,
1062 | "metadata": {},
1063 | "outputs": [],
1064 | "source": [
1065 | "text = [\"coiffeur, boulangerie, pâtisserie\"] # one text description\n",
1066 | "X= np.array([[text[0], 0, 0, 0, 0, 0, 0]]) # our new entry\n",
1067 | "TOP_K = 5\n",
1068 | "\n",
1069 | "pred, conf = model.predict(X, top_k=TOP_K)\n",
1070 | "pred_naf = encoder.inverse_transform(pred.reshape(-1))\n",
1071 | "subset = naf2008.set_index(\"code\").loc[np.flip(pred_naf)]\n",
1072 | "\n",
1073 | "for i in range(TOP_K-1, -1, -1):\n",
1074 | " print(f\"Prediction: {pred_naf[i]}, confidence: {conf[0, i]}, description: {subset['libelle'][pred_naf[i]]}\")"
1075 | ],
1076 | "id": "dbbad77d"
1077 | },
1078 | {
1079 | "cell_type": "markdown",
1080 | "metadata": {},
1081 | "source": [
1082 | "# Explainability"
1083 | ],
1084 | "id": "f84e6bff-8fa7-4896-b60a-005ae5f1d3eb"
1085 | },
1086 | {
1087 | "cell_type": "code",
1088 | "execution_count": 30,
1089 | "metadata": {},
1090 | "outputs": [],
1091 | "source": [
1092 | "from torchFastText.explainability.visualisation import (\n",
1093 | " visualize_letter_scores,\n",
1094 | " visualize_word_scores,\n",
1095 | ")\n",
1096 | "\n",
1097 | "pred, conf, all_scores, all_scores_letters = model.predict_and_explain(X)\n",
1098 | "visualize_word_scores(all_scores, text, pred_naf.reshape(1, -1))\n",
1099 | "visualize_letter_scores(all_scores_letters, text, pred_naf.reshape(1, -1))"
1100 | ],
1101 | "id": "58c46021"
1102 | }
1103 | ],
1104 | "nbformat": 4,
1105 | "nbformat_minor": 5,
1106 | "metadata": {
1107 | "kernelspec": {
1108 | "name": "python3",
1109 | "display_name": "Python 3 (ipykernel)",
1110 | "language": "python",
1111 | "path": "/opt/conda/share/jupyter/kernels/python3"
1112 | },
1113 | "language_info": {
1114 | "name": "python",
1115 | "codemirror_mode": {
1116 | "name": "ipython",
1117 | "version": "3"
1118 | },
1119 | "file_extension": ".py",
1120 | "mimetype": "text/x-python",
1121 | "nbconvert_exporter": "python",
1122 | "pygments_lexer": "ipython3",
1123 | "version": "3.12.7"
1124 | }
1125 | }
1126 | }
--------------------------------------------------------------------------------
/notebooks/example.qmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Exemple d'utilisation de la librairie `TorchFastText`"
3 | ---
4 |
5 |
6 |
7 | _Warning_
8 |
9 | _The `TorchFastText` library is still under active development. Have a regular look at [https://github.com/inseefrlab/torch-fastText](https://github.com/inseefrlab/torch-fastText) for the latest information._
10 |
11 | To install the package, you can run the following snippet:
12 |
13 | ```{python}
14 | #| output: false
15 | #| eval: false
16 |
17 | # Stable version
18 | pip install torchFastText
19 | # Development version
20 | # pip install git+https://github.com/InseeFrLab/torch-fastText.git
21 | ```
22 |
23 | # Load and preprocess data
24 |
25 | In this guide, we illustrate the main package functionalities using the following `DataFrame`:
26 |
27 | ```{python}
28 | import pandas as pd
29 | df = pd.read_parquet("https://minio.lab.sspcloud.fr/projet-ape/extractions/20241027_sirene4.parquet")
30 | df = df.sample(10000)
31 | ```
32 |
33 | Our goal will be to build a multiclass classifier for the `code` variable using `libelle` as a feature.
34 |
35 | ## Enriching our test dataset
36 |
37 | Unlike `Fasttext`, this package offers the possibility of having several feature columns of different types (string for the text column and additional variables in numeric form, for example). To illustrate that, we propose the following enrichment of the example dataset:
38 |
39 |
40 | ```{python}
41 | import pandas as pd
42 | import numpy as np
43 | from sklearn.model_selection import train_test_split
44 | from sklearn.preprocessing import LabelEncoder
45 |
46 | def categorize_surface(
47 |     df: pd.DataFrame, surface_feature_name: str, like_sirene_3: bool = True
48 | ) -> pd.DataFrame:
49 | """
50 | Categorize the surface of the activity.
51 |
52 | Args:
53 | df (pd.DataFrame): DataFrame to categorize.
54 | surface_feature_name (str): Name of the surface feature.
55 | like_sirene_3 (bool): If True, categorize like Sirene 3.
56 |
57 | Returns:
58 | pd.DataFrame: DataFrame with a new column "surf_cat".
59 | """
60 |     # Check surface feature exists
61 |     if surface_feature_name not in df.columns:
62 |         raise ValueError(f"Surface feature {surface_feature_name} not found in DataFrame.")
63 |     df_copy = df.copy()
64 |     df_copy[surface_feature_name] = df_copy[surface_feature_name].replace("nan", np.nan)
65 |     df_copy[surface_feature_name] = df_copy[surface_feature_name].astype(float)
66 | # Check surface feature is a float variable
67 | if not (pd.api.types.is_float_dtype(df_copy[surface_feature_name])):
68 | raise ValueError(f"Surface feature {surface_feature_name} must be a float variable.")
69 |
70 | if like_sirene_3:
71 | # Categorize the surface
72 | df_copy["surf_cat"] = pd.cut(
73 | df_copy[surface_feature_name],
74 | bins=[0, 120, 400, 2500, np.inf],
75 | labels=["1", "2", "3", "4"],
76 | ).astype(str)
77 | else:
78 | # Log transform the surface
79 | df_copy["surf_log"] = np.log(df[surface_feature_name])
80 |
81 | # Categorize the surface
82 | df_copy["surf_cat"] = pd.cut(
83 | df_copy.surf_log,
84 | bins=[0, 3, 4, 5, 12],
85 | labels=["1", "2", "3", "4"],
86 | ).astype(str)
87 |
88 | df_copy[surface_feature_name] = df_copy["surf_cat"].replace("nan", "0")
89 | df_copy[surface_feature_name] = df_copy[surface_feature_name].astype(int)
90 | df_copy = df_copy.drop(columns=["surf_log", "surf_cat"], errors="ignore")
91 | return df_copy
92 |
93 |
94 | def clean_and_tokenize_df(
95 | df,
96 | categorical_features=["EVT", "CJ", "NAT", "TYP", "CRT"],
97 | text_feature="libelle_processed",
98 | label_col="apet_finale",
99 | ):
100 | df.fillna("nan", inplace=True)
101 |
102 | df = df.rename(
103 | columns={
104 | "evenement_type": "EVT",
105 | "cj": "CJ",
106 | "activ_nat_et": "NAT",
107 | "liasse_type": "TYP",
108 | "activ_surf_et": "SRF",
109 | "activ_perm_et": "CRT",
110 | }
111 | )
112 |
113 | les = []
114 | for col in categorical_features:
115 | le = LabelEncoder()
116 | df[col] = le.fit_transform(df[col])
117 | les.append(le)
118 |
119 | df = categorize_surface(df, "SRF", like_sirene_3=True)
120 | df = df[[text_feature, "EVT", "CJ", "NAT", "TYP", "SRF", "CRT", label_col]]
121 |
122 | return df, les
123 |
124 |
125 | def stratified_split_rare_labels(X, y, test_size=0.2, min_train_samples=1):
126 | # Get unique labels and their frequencies
127 | unique_labels, label_counts = np.unique(y, return_counts=True)
128 |
129 | # Separate rare and common labels
130 | rare_labels = unique_labels[label_counts == 1]
131 |
132 | # Create initial mask for rare labels to go into training set
133 | rare_label_mask = np.isin(y, rare_labels)
134 |
135 | # Separate data into rare and common label datasets
136 | X_rare = X[rare_label_mask]
137 | y_rare = y[rare_label_mask]
138 | X_common = X[~rare_label_mask]
139 | y_common = y[~rare_label_mask]
140 |
141 | # Split common labels stratified
142 | X_common_train, X_common_test, y_common_train, y_common_test = train_test_split(
143 | X_common, y_common, test_size=test_size, stratify=y_common
144 | )
145 |
146 | # Combine rare labels with common labels split
147 | X_train = np.concatenate([X_rare, X_common_train])
148 | y_train = np.concatenate([y_rare, y_common_train])
149 | X_test = X_common_test
150 | y_test = y_common_test
151 |
152 | return X_train, X_test, y_train, y_test
153 |
154 | def add_libelles(
155 | df: pd.DataFrame,
156 | df_naf: pd.DataFrame,
157 | y: str,
158 | text_feature: str,
159 | textual_features: list,
160 | categorical_features: list,
161 | ):
162 | missing_codes = set(df_naf["code"])
163 | fake_obs = df_naf[df_naf["code"].isin(missing_codes)]
164 | fake_obs[y] = fake_obs["code"]
165 | fake_obs[text_feature] = fake_obs[[text_feature]].apply(
166 | lambda row: " ".join(f"[{col}] {val}" for col, val in row.items() if val != ""), axis=1
167 | )
168 | df = pd.concat([df, fake_obs[[col for col in fake_obs.columns if col in df.columns]]])
169 |
170 | if textual_features is not None:
171 | for feature in textual_features:
172 | df[feature] = df[feature].fillna(value="")
173 | if categorical_features is not None:
174 | for feature in categorical_features:
175 | df[feature] = df[feature].fillna(value="NaN")
176 |
177 | print(f"\t*** {len(missing_codes)} codes have been added in the database...\n")
178 | return df
179 | ```
180 |
181 | ```{python}
182 | categorical_features = ["evenement_type", "cj", "activ_nat_et", "liasse_type", "activ_surf_et", "activ_perm_et"]
183 | text_feature = "libelle"
184 | y = "apet_finale"
185 | textual_features = None
186 |
187 | naf2008 = pd.read_csv("https://minio.lab.sspcloud.fr/projet-ape/data/naf2008.csv", sep=";")
188 | df = add_libelles(df, naf2008, y, text_feature, textual_features, categorical_features)
189 | ```
190 |
191 |
192 | ## Preprocessing
193 |
194 | To reduce noise in text fields, we recommend pre-processing before training a model with our package. We assume this preprocessing is handled by the package user: this gives them full control over data cleansing.
195 |
196 | Here's an example of the type of preprocessing that can be carried out before moving on to the modeling phase:
197 |
198 | ```{python}
199 | from torchFastText.preprocess import clean_text_feature
200 | df["libelle_processed"] = clean_text_feature(df["libelle"])
201 | ```
202 |
203 | Right now, the model requires the label (variable y) to be a numerical variable. If the label variable is a text variable, we recommend using scikit-learn's [LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html) to convert it into a numeric variable. The encoder can then be used to map predictions back to the original labels.
204 |
205 | ```{python}
206 | encoder = LabelEncoder()
207 | df["apet_finale"] = encoder.fit_transform(df["apet_finale"])
208 | ```
209 |
210 | The function `clean_and_tokenize_df` requires special `DataFrame` formatting:
211 |
212 | - The first column contains the processed text (`str`)
213 | - The following columns contain the encoded categorical (discrete) variables in `int` format
214 |
215 |
216 | ```{python}
217 | df, _ = clean_and_tokenize_df(df, text_feature="libelle_processed")
218 | X = df[["libelle_processed", "EVT", "CJ", "NAT", "TYP", "CRT", "SRF"]].values
219 | y = df["apet_finale"].values
220 | ```
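221 | 
222 | A quick sanity check (illustrative): `X` should contain one text column followed by the six encoded categorical columns, with one label per row in `y`.
223 | 
224 | ```{python}
225 | #| eval: false
226 | X.shape, y.shape  # expected: (n_obs, 7) and (n_obs,)
227 | ```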
221 |
222 | ## Splitting in train-test sets
223 |
224 | As usual in machine learning, you need to split your data into training and test/validation samples to obtain robust performance statistics.
225 |
226 | This work is the responsibility of the package's users. Here's an example of how to do it, using the [`train_test_split`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) function from scikit-learn.
227 |
228 | ```{python}
229 | from sklearn.model_selection import train_test_split
230 | X_train, X_test, y_train, y_test = train_test_split(X, y)
231 | ```
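232 | 
233 | If you stratify by label, classes observed only once will make the split fail. The `stratified_split_rare_labels` helper defined above handles this by routing singleton classes into the training set; a sketch of using it instead:
234 | 
235 | ```{python}
236 | #| eval: false
237 | X_train, X_test, y_train, y_test = stratified_split_rare_labels(X, y, test_size=0.2)
238 | ```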
232 |
233 | # Build the torch-fastText model (without training it)
234 |
235 | There are several ways to define and train a torchFastText model with this package.
236 | 
237 | We first show how to initialize the model and then build it.
238 | 
239 | The `torchFastText` constructor accepts the following parameters:
240 |
241 | | Parameter | Meaning | Example Value |
242 | |---------------------------------|---------------------------------------------------------------------|--------------|
243 | | `num_tokens` | Number of rows in the embedding matrix (size of the vocabulary) | 100000 |
244 | | `embedding_dim` | Dimension of the embedding (number of columns in the matrix) | 50 |
245 | | `sparse` | Use sparse embedding for fast computation (PyTorch) | False |
246 | | `categorical_embedding_dims` | Dimension of the embedding for categorical features | 10 |
247 | | `min_count` | Minimum occurrences of a word in the corpus to be included | 1 |
248 | | `min_n` | Minimum length of character n-grams | 3 |
249 | | `max_n` | Maximum length of character n-grams | 6 |
250 | | `len_word_ngrams` | Length of word n-grams | 3 |
251 |
252 |
253 | ```{python}
254 | from torchFastText import torchFastText
255 |
256 | parameters = {
257 | "num_tokens": 100000,
258 | "embedding_dim": 50,
259 | "sparse": False,
260 | "categorical_embedding_dims": 10,
261 | "min_count": 1,
262 | "min_n": 3,
263 | "max_n": 6,
264 | "len_word_ngrams": 3,
265 | }
266 |
267 | parameters_train = {
268 | "lr": 0.004,
269 | "num_epochs": 1,
270 | "batch_size": 256,
271 | "patience": 3
272 | }
273 |
274 | model = torchFastText(**parameters)
275 | ```
276 |
277 | `model` is now a `torchFastText` object:
278 |
279 | ```{python}
280 | type(model)
281 | ```
282 |
283 | The model configuration can be saved as JSON for later reuse:
284 |
285 | ```{python}
286 | model.to_json('torchFastText_config.json')
287 | # model = torchFastText.from_json('torchFastText_config.json')
288 | ```
289 |
290 | We can now apply `build` to get the model ready for training. The table below recalls the training parameters defined earlier:
291 |
292 | | Parameter | Meaning | Example Value |
293 | |---------------------------------|---------------------------------------------------------------------|--------------|
294 | | `lr` | Learning rate | 0.004 |
295 | | `num_epochs` | Number of training epochs | 1 |
296 | | `batch_size` | Batch size for training | 256 |
297 | | `patience` | Early stopping patience (number of epochs without improvement) | 3 |
298 |
299 |
300 | We build the model using the training data.
301 | We then have access to the tokenizer, the PyTorch model, and a PyTorch Lightning module ready to be trained.
302 | Note that Lightning is a high-level framework for PyTorch that simplifies the process of training, validating, and deploying machine learning models.
303 |
304 |
305 | ```{python}
306 | model.build(X_train, y_train, lightning=True, lr=parameters_train.get("lr"))
307 | ```
308 |
309 | One can retrieve different objects from the `model` instance:
310 |
311 | * `model.pytorch_model`
312 | * `model.tokenizer`
313 | * `model.lightning_module`
314 |
315 |
316 | ```{python}
317 | model.pytorch_model
318 | ```
319 |
320 | ```{python}
321 | model.tokenizer
322 | ```
323 |
324 | ```{python}
325 | model.lightning_module
326 | ```
327 |
328 | One can also retrieve more detailed information from the tokenizer. This is useful to see how text is parsed before being fed to the neural network:
329 |
330 |
331 | ```{python}
332 | from pprint import pprint
333 | sentence = ["lorem ipsum dolor sit amet"]
334 | pprint(model.tokenizer.tokenize(sentence)[2][0])
335 | ```
336 |
337 |
338 | Saving parameters to JSON can also be done after building, but the model needs to be rebuilt after loading.
339 |
340 | ```{python}
341 | model.to_json('torchFastText_config.json')
342 | ```
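343 | 
344 | For instance, a reload could look like this (a minimal sketch: the configuration is read back from JSON, then the model is rebuilt with the training data, assumed to still be available):
345 | 
346 | ```{python}
347 | #| eval: false
348 | model = torchFastText.from_json('torchFastText_config.json')
349 | model.build(X_train, y_train, lightning=True, lr=parameters_train.get("lr"))
350 | ```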
343 |
344 |
345 | ## Alternative way to build torchFastText
346 |
347 | The training data is only used to initialize the tokenizer; X_train and y_train are not needed to initialize the PyTorch model, provided we pass the right parameters to construct its layers.
348 | 
349 | To highlight this, we provide a lower-level process to build the model, where one can first build the tokenizer and then build the model with custom architecture parameters.
350 |
351 | The tokenizer can be loaded **from the same JSON file** as the model parameters, or initialized using the right arguments.
352 |
353 |
354 | ```{python}
355 | del model
356 | ```
357 |
358 | Let's decompose our features into two groups:
359 | 
360 | * the textual feature, stored in the first column of the feature matrix
361 | * all other columns, which are categorical variables
362 |
363 | ```{python}
364 | training_text = X_train[:, 0].tolist()
365 | categorical_variables = X_train[:, 1:]
366 | ```
367 |
368 | We need to create a few variables that will be useful later:
369 |
370 | ```{python}
371 | CAT_VOCAB_SIZE = (np.max(categorical_variables, axis=0) + 1).astype(int).tolist()
372 | NUM_CLASSES = len(np.unique(y_train))
373 | NUM_CAT_VAR = categorical_variables.shape[1]
374 | ```
375 |
376 | Now let's get to the nitty-gritty. There are several ways to create an instance of the tokenizer.
377 | 
378 | First, we can create the tokenizer from:
379 | 
380 | * the model definition in the JSON file created beforehand
381 | * the textual data in the training dataset
382 |
383 | ```{python}
384 | from torchFastText.datasets import NGramTokenizer
385 | tokenizer = NGramTokenizer.from_json('torchFastText_config.json', training_text)
386 | ```
387 |
388 | ```{python}
389 | tokenizer.tokenize("Hello world")
390 | ```
391 |
392 | However, there is a more straightforward way to do this: create the `NGramTokenizer` instance directly:
393 |
394 |
395 | ```{python}
396 | tokenizer = NGramTokenizer(
397 | **parameters,
398 | training_text=training_text
399 | )
400 | ```
401 |
402 | ```{python}
403 | tokenizer.tokenize("Hello world")
404 | ```
405 |
406 | Why create an `NGramTokenizer` separately? Because the model constructor is then independent of the training data:
407 |
408 | ```{python}
409 | #| echo: false
410 | #| eval: false
411 | # TODO : allow to do that
412 | #torchFastText.build_from_tokenizer(
413 | #tokenizer,
414 | #**parameters,
415 | #**parameters_build
416 | # )
417 | ```
418 |
419 | ```{python}
420 | model = torchFastText.build_from_tokenizer(
421 | tokenizer,
422 | embedding_dim=parameters["embedding_dim"],
423 | categorical_embedding_dims=parameters["categorical_embedding_dims"],
424 | sparse=parameters["sparse"],
425 | lr=parameters_train["lr"],
426 | num_classes=NUM_CLASSES,
427 | num_categorical_features=NUM_CAT_VAR,
428 | categorical_vocabulary_sizes=CAT_VOCAB_SIZE
429 | )
430 | ```
431 |
432 | __Warning__:
433 |
434 | If the PyTorch model was built without the training data, keep in mind that the architecture you customize here must match the vocabulary sizes of the categorical variables and the total number of classes; otherwise the model will raise an error during training.
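435 | 
436 | A quick consistency check, reusing the variables computed above (a sketch), can catch such a mismatch before training:
437 | 
438 | ```{python}
439 | #| eval: false
440 | assert NUM_CLASSES == len(np.unique(y_train))
441 | assert NUM_CAT_VAR == categorical_variables.shape[1]
442 | assert CAT_VOCAB_SIZE == (np.max(categorical_variables, axis=0) + 1).astype(int).tolist()
443 | ```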
435 |
436 |
437 | # Train a torchFastText model directly
438 |
439 | If no advanced customization or PyTorch tuning is necessary, there is a direct way of training the model.
440 |
441 |
442 | ```{python}
443 | #| eval: false
444 | model.train(
445 | X_train,
446 | y_train,
447 | X_test,
448 | y_test,
449 | num_epochs=parameters_train['num_epochs'],
450 | batch_size=parameters_train['batch_size'],
451 | patience_scheduler=parameters_train['patience'],
452 | patience_train=parameters_train['patience'],
453 | lr=parameters_train['lr'],
454 | verbose = True
455 | )
456 | ```
457 |
458 | # Load a trained model from a Lightning checkpoint
459 |
460 | /!\ TOCOMPLETE
461 |
462 |
463 | ```{python}
464 | #| eval: false
465 | model.load_from_checkpoint(model.best_model_path) # or any other checkpoint path (string)
466 | ```
467 |
468 | # Predicting labels for new data
469 |
470 |
471 | ```{python}
472 | #| eval: false
473 | text = ["coiffeur, boulangerie, pâtisserie"] # one text description
474 | X = np.array([[text[0], 0, 0, 0, 0, 0, 0]])  # our new entry
475 | TOP_K = 5
476 |
477 | pred, conf = model.predict(X, top_k=TOP_K)
478 | pred_naf = encoder.inverse_transform(pred.reshape(-1))
479 | subset = naf2008.set_index("code").loc[np.flip(pred_naf)]
480 |
481 | for i in range(TOP_K-1, -1, -1):
482 | print(f"Prediction: {pred_naf[i]}, confidence: {conf[0, i]}, description: {subset['libelle'][pred_naf[i]]}")
483 |
484 | ```
485 |
486 | # Explainability
487 |
488 |
489 | ```{python}
490 | #| eval: false
491 | from torchFastText.explainability.visualisation import (
492 | visualize_letter_scores,
493 | visualize_word_scores,
494 | )
495 |
496 | pred, conf, all_scores, all_scores_letters = model.predict_and_explain(X)
497 | visualize_word_scores(all_scores, text, pred_naf.reshape(1, -1))
498 | visualize_letter_scores(all_scores_letters, text, pred_naf.reshape(1, -1))
499 | ```
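500 | 
501 | Note that the explainability features rely on the optional `explainability` dependencies declared in `pyproject.toml` (`captum`, `nltk`, `unidecode`); they can be installed as an extra, e.g. `pip install torchFastText[explainability]`. The plotting helpers additionally assume `matplotlib` and `seaborn` are available.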
--------------------------------------------------------------------------------
/notebooks/torchFastText_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "embedding_dim": 50,
3 | "sparse": false,
4 | "num_tokens": 100000,
5 | "min_count": 1,
6 | "min_n": 3,
7 | "max_n": 6,
8 | "len_word_ngrams": 3,
9 | "num_classes": 646,
10 | "num_rows": 107992,
11 | "categorical_vocabulary_sizes": [
12 | 24,
13 | 40,
14 | 8,
15 | 13,
16 | 3,
17 | 4
18 | ],
19 | "categorical_embedding_dims": 10,
20 | "num_categorical_features": 6,
21 | "direct_bagging": true
22 | }
--------------------------------------------------------------------------------
/notebooks/utils.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from sklearn.model_selection import train_test_split
4 | from sklearn.preprocessing import LabelEncoder
5 |
6 | def categorize_surface(
7 | df: pd.DataFrame, surface_feature_name: str, like_sirene_3: bool = True
8 | ) -> pd.DataFrame:
9 | """
10 | Categorize the surface of the activity.
11 |
12 | Args:
13 | df (pd.DataFrame): DataFrame to categorize.
14 | surface_feature_name (str): Name of the surface feature.
15 | like_sirene_3 (bool): If True, categorize like Sirene 3.
16 |
17 | Returns:
18 | pd.DataFrame: DataFrame with a new column "surf_cat".
19 | """
20 | df_copy = df.copy()
21 | # Check surface feature exists (before first use, so a missing column raises the intended error)
22 | if surface_feature_name not in df.columns:
23 | raise ValueError(f"Surface feature {surface_feature_name} not found in DataFrame.")
24 | df_copy[surface_feature_name] = df_copy[surface_feature_name].replace("nan", np.nan)
25 | df_copy[surface_feature_name] = df_copy[surface_feature_name].astype(float)
26 | # Check surface feature is a float variable
27 | if not (pd.api.types.is_float_dtype(df_copy[surface_feature_name])):
28 | raise ValueError(f"Surface feature {surface_feature_name} must be a float variable.")
29 |
30 | if like_sirene_3:
31 | # Categorize the surface
32 | df_copy["surf_cat"] = pd.cut(
33 | df_copy[surface_feature_name],
34 | bins=[0, 120, 400, 2500, np.inf],
35 | labels=["1", "2", "3", "4"],
36 | ).astype(str)
37 | else:
38 | # Log transform the surface
39 | df_copy["surf_log"] = np.log(df[surface_feature_name])
40 |
41 | # Categorize the surface
42 | df_copy["surf_cat"] = pd.cut(
43 | df_copy.surf_log,
44 | bins=[0, 3, 4, 5, 12],
45 | labels=["1", "2", "3", "4"],
46 | ).astype(str)
47 |
48 | df_copy[surface_feature_name] = df_copy["surf_cat"].replace("nan", "0")
49 | df_copy[surface_feature_name] = df_copy[surface_feature_name].astype(int)
50 | df_copy = df_copy.drop(columns=["surf_log", "surf_cat"], errors="ignore")
51 | return df_copy
52 |
53 |
54 | def clean_and_tokenize_df(
55 | df,
56 | categorical_features=["EVT", "CJ", "NAT", "TYP", "CRT"],
57 | text_feature="libelle_processed",
58 | label_col="apet_finale",
59 | ):
60 | df.fillna("nan", inplace=True)
61 |
62 | df = df.rename(
63 | columns={
64 | "evenement_type": "EVT",
65 | "cj": "CJ",
66 | "activ_nat_et": "NAT",
67 | "liasse_type": "TYP",
68 | "activ_surf_et": "SRF",
69 | "activ_perm_et": "CRT",
70 | }
71 | )
72 |
73 | les = []
74 | for col in categorical_features:
75 | le = LabelEncoder()
76 | df[col] = le.fit_transform(df[col])
77 | les.append(le)
78 |
79 | df = categorize_surface(df, "SRF", like_sirene_3=True)
80 | df = df[[text_feature, "EVT", "CJ", "NAT", "TYP", "SRF", "CRT", label_col]]
81 |
82 | return df, les
83 |
84 |
85 | def stratified_split_rare_labels(X, y, test_size=0.2, min_train_samples=1):
86 | # Get unique labels and their frequencies
87 | unique_labels, label_counts = np.unique(y, return_counts=True)
88 |
89 | # Separate rare and common labels
90 | rare_labels = unique_labels[label_counts == 1]
91 |
92 | # Create initial mask for rare labels to go into training set
93 | rare_label_mask = np.isin(y, rare_labels)
94 |
95 | # Separate data into rare and common label datasets
96 | X_rare = X[rare_label_mask]
97 | y_rare = y[rare_label_mask]
98 | X_common = X[~rare_label_mask]
99 | y_common = y[~rare_label_mask]
100 |
101 | # Split common labels stratified
102 | X_common_train, X_common_test, y_common_train, y_common_test = train_test_split(
103 | X_common, y_common, test_size=test_size, stratify=y_common
104 | )
105 |
106 | # Combine rare labels with common labels split
107 | X_train = np.concatenate([X_rare, X_common_train])
108 | y_train = np.concatenate([y_rare, y_common_train])
109 | X_test = X_common_test
110 | y_test = y_common_test
111 |
112 | return X_train, X_test, y_train, y_test
113 |
114 | def add_libelles(
115 | df: pd.DataFrame,
116 | df_naf: pd.DataFrame,
117 | y: str,
118 | text_feature: str,
119 | textual_features: list,
120 | categorical_features: list,
121 | ):
122 | missing_codes = set(df_naf["code"])
123 | fake_obs = df_naf[df_naf["code"].isin(missing_codes)]
124 | fake_obs[y] = fake_obs["code"]
125 | fake_obs[text_feature] = fake_obs[[text_feature]].apply(
126 | lambda row: " ".join(f"[{col}] {val}" for col, val in row.items() if val != ""), axis=1
127 | )
128 | df = pd.concat([df, fake_obs[[col for col in fake_obs.columns if col in df.columns]]])
129 |
130 | if textual_features is not None:
131 | for feature in textual_features:
132 | df[feature] = df[feature].fillna(value="")
133 | if categorical_features is not None:
134 | for feature in categorical_features:
135 | df[feature] = df[feature].fillna(value="NaN")
136 |
137 | print(f"\t*** {len(missing_codes)} codes have been added in the database...\n")
138 | return df
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "torchFastText"
3 | description = "An implementation of the https://github.com/facebookresearch/fastText supervised learning algorithm for text classification using Pytorch."
4 | authors = [
5 | { name = "Tom Seimandi", email = "tom.seimandi@gmail.com" },
6 | { name = "Julien Pramil", email = "julien.pramil@insee.fr" },
7 | { name = "Meilame Tayebjee", email = "meilame.tayebjee@insee.fr" },
8 | { name = "Cédric Couralet", email = "cedric.couralet@insee.fr" },
9 | ]
10 | readme = "README.md"
11 | repository = "https://github.com/InseeFrLab/torch-fastText"
12 | classifiers = [
13 | "Programming Language :: Python :: 3",
14 | "License :: OSI Approved :: MIT License",
15 | "Operating System :: OS Independent",
16 | ]
17 | keywords = ["fastText", "text classification", "NLP", "automatic coding", "deep learning"]
18 | dependencies = [
19 | "numpy>=1.26.4",
20 | "pytorch-lightning>=2.4.0"
21 | ]
22 | requires-python = ">=3.10"
23 | dynamic = ["version"]
24 |
25 |
26 | [project.optional-dependencies]
27 | explainability = ["unidecode", "nltk", "captum"]
28 | preprocess = ["unidecode", "nltk"]
29 |
30 | [build-system]
31 | requires = ["poetry-core>=2.0.0"]
32 | build-backend = "poetry.core.masonry.api"
33 |
34 | [tool.ruff]
35 | line-length = 100
36 |
37 | [tool.poetry]
38 | version = "0.0.1-dev" # base version
39 | packages = [{include = "torchFastText"}]
40 |
41 | [tool.poetry.requires-plugins]
42 | poetry-dynamic-versioning = { version = ">=1.0.0,<2.0.0", extras = ["plugin"] }
43 |
44 | [tool.poetry-dynamic-versioning]
45 | enable = true
46 | vcs = "git"
47 | style = "semver"
48 |
--------------------------------------------------------------------------------
/renovate.json:
--------------------------------------------------------------------------------
1 | {
2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json",
3 | "extends": [
4 | "config:recommended"
5 | ]
6 | }
7 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch
2 | pytorch_lightning
3 | numpy
4 | pandas
5 | scikit-learn
6 | pyarrow
7 | nltk
8 | unidecode
9 | captum
10 | ipywidgets
11 | seaborn
12 | ruff>=0.7.1
13 | pre-commit
14 | pytest
15 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InseeFrLab/torch-fastText/6b9aeb770033af311c2558799b07d14720f42f94/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_all.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from pathlib import Path
3 | from itertools import product
4 |
5 | import numpy as np
6 | import pandas as pd
7 | from sklearn.model_selection import train_test_split
8 | from sklearn.preprocessing import LabelEncoder
9 |
10 | from torchFastText import torchFastText
11 | from torchFastText.preprocess import clean_text_feature
12 | from torchFastText.datasets import NGramTokenizer
13 |
14 | source_path = Path(__file__).resolve()
15 | source_dir = source_path.parent
16 |
17 |
18 | @pytest.fixture(scope="session", autouse=True)
19 | def data():
20 | data = {
21 | "Catégorie": [
22 | "Politique",
23 | "Politique",
24 | "Politique",
25 | "Politique",
26 | "Politique",
27 | "Politique",
28 | "Politique",
29 | "Politique",
30 | "International",
31 | "International",
32 | "International",
33 | "International",
34 | "International",
35 | "International",
36 | "International",
37 | "International",
38 | "Célébrités",
39 | "Célébrités",
40 | "Célébrités",
41 | "Célébrités",
42 | "Célébrités",
43 | "Célébrités",
44 | "Célébrités",
45 | "Célébrités",
46 | "Sport",
47 | "Sport",
48 | "Sport",
49 | "Sport",
50 | "Sport",
51 | "Sport",
52 | "Sport",
53 | "Sport",
54 | ],
55 | "Titre": [
56 | "Nouveau budget présenté par le gouvernement",
57 | "Élections législatives : les principaux candidats en lice",
58 | "Réforme de la santé : les réactions des syndicats",
59 | "Nouvelle loi sur l'éducation : les points clés",
60 | "Les impacts des élections municipales sur la politique nationale",
61 | "Réforme des retraites : les enjeux et débats",
62 | "Nouveau plan de relance économique annoncé",
63 | "La gestion de la crise climatique par le gouvernement",
64 | "Accord climatique mondial : les engagements renouvelés",
65 | "Conflit au Moyen-Orient : nouvelles tensions",
66 | "Économie mondiale : les prévisions pour 2025",
67 | "Sommet international sur la paix : les résultats",
68 | "Répercussions des nouvelles sanctions économiques",
69 | "Les négociations commerciales entre les grandes puissances",
70 | "Les défis de la diplomatie moderne",
71 | "Les conséquences du Brexit sur l'Europe",
72 | "La dernière interview de [Nom de la célébrité]",
73 | "Les révélations de [Nom de la célébrité] sur sa vie privée",
74 | "Le retour sur scène de [Nom de la célébrité]",
75 | "La nouvelle romance de [Nom de la célébrité]",
76 | "Les scandales récents dans l'industrie du divertissement",
77 | "Les projets humanitaires de [Nom de la célébrité]",
78 | "La carrière impressionnante de [Nom de la célébrité]",
79 | "Les derniers succès cinématographiques de [Nom de la célébrité]",
80 | "Le championnat du monde de football : les favoris",
81 | "Record battu par [Nom de l'athlète] lors des Jeux Olympiques",
82 | "La finale de la Coupe de France : qui remportera le trophée?",
83 | "Les transferts les plus chers de la saison",
84 | "Les performances des athlètes français aux championnats du monde",
85 | "Les nouveaux talents à surveiller dans le monde du sport",
86 | "L'impact de la technologie sur les sports traditionnels",
87 | "Les grandes compétitions sportives de l'année à venir",
88 | ],
89 | }
90 | df = pd.DataFrame(data)
91 | labelEncoder = LabelEncoder()
92 | y = labelEncoder.fit_transform(df["Catégorie"])
93 | df["Titre_cleaned"] = clean_text_feature(df["Titre"])
94 | X_train, X_test, y_train, y_test = train_test_split(
95 | df["Titre_cleaned"], y, test_size=0.1, stratify=y
96 | )
97 | return X_train, X_test, y_train, y_test
98 |
99 |
100 | def test_building(data):
101 | num_tokens = 4
102 | embedding_dim = 10
103 | min_count = 1
104 | min_n = 2
105 | max_n = 5
106 | len_word_ngrams = 2
107 | sparse = False
108 | vocab_possible_values = [None, [5, 6, 7, 8]]
109 | cat_embedding_dim_possible_values = [[10, 20, 3, 7], 10, None]
110 | num_cat_possible_values = [None, 4]
111 | for vocab, cat_embedding_dim, num_cat in product(
112 | vocab_possible_values, cat_embedding_dim_possible_values, num_cat_possible_values
113 | ):
114 | model = torchFastText(
115 | num_tokens=num_tokens,
116 | num_rows=num_tokens,
117 | embedding_dim=embedding_dim,
118 | min_count=min_count,
119 | min_n=min_n,
120 | max_n=max_n,
121 | len_word_ngrams=len_word_ngrams,
122 | sparse=sparse,
123 | categorical_embedding_dims=cat_embedding_dim,
124 | categorical_vocabulary_sizes=vocab,
125 | num_categorical_features=num_cat,
126 | num_classes=3,
127 | )
128 | model._build_pytorch_model()
129 | assert True, "Model building completed without errors"
130 |
131 |
132 | def test_training(data):
133 | num_tokens = 4
134 | embedding_dim = 10
135 | min_count = 1
136 | min_n = 2
137 | max_n = 5
138 | len_word_ngrams = 2
139 | sparse = False
140 | vocab_possible_values = [None, [5, 6, 7, 8]]
141 | cat_embedding_dim_possible_values = [[10, 20, 3, 7], 10, None]
142 | num_cat_possible_values = [None, 4]
143 |
144 | X_train, X_test, y_train, y_test = data
145 |
146 | cat_data_train = np.random.randint(0, 4, (len(X_train), 4))
147 | cat_data_test = np.random.randint(0, 4, (len(X_test), 4))
148 | X_train_full = np.concatenate([np.asarray(X_train).reshape(-1, 1), cat_data_train], axis=1)
149 | X_test_full = np.concatenate([np.asarray(X_test).reshape(-1, 1), cat_data_test], axis=1)
150 | no_cat_var_model = None
151 | for vocab, cat_embedding_dim, num_cat in product(
152 | vocab_possible_values, cat_embedding_dim_possible_values, num_cat_possible_values
153 | ):
154 | print(vocab, cat_embedding_dim, num_cat)
155 | model = torchFastText(
156 | num_tokens=num_tokens,
157 | embedding_dim=embedding_dim,
158 | min_count=min_count,
159 | min_n=min_n,
160 | max_n=max_n,
161 | len_word_ngrams=len_word_ngrams,
162 | sparse=sparse,
163 | categorical_embedding_dims=cat_embedding_dim,
164 | categorical_vocabulary_sizes=vocab,
165 | num_categorical_features=num_cat,
166 | num_classes=3,
167 | )
168 | if (vocab is None) and (cat_embedding_dim is None) and (num_cat is None):
169 | model.train(
170 | np.asarray(X_train),
171 | np.asarray(y_train),
172 | np.asarray(X_test),
173 | np.asarray(y_test),
174 | num_epochs=1,
175 | batch_size=32,
176 | lr=0.001,
177 | num_workers=4,
178 | )
179 | no_cat_var_model = model
180 | else:
181 | model.train(
182 | np.asarray(X_train_full),
183 | np.asarray(y_train),
184 | np.asarray(X_test_full),
185 | np.asarray(y_test),
186 | num_epochs=1,
187 | batch_size=32,
188 | lr=0.001,
189 | num_workers=4,
190 | )
191 | print("done")
192 | assert True, "Training completed without errors"
193 | tokenizer = model.tokenizer
194 | tokenized_text_tokens, tokenized_text, id_to_token_dicts, token_to_id_dicts = (
195 | tokenizer.tokenize(["Nouveau budget présenté par le gouvernement"])
196 | )
197 | assert isinstance(tokenized_text, list)
198 | assert len(tokenized_text) > 0
199 |
200 | predictions, confidence, all_scores, all_scores_letters = model.predict_and_explain(
201 | np.asarray(["Nouveau budget présenté par le gouvernement"] + [0] * 4).reshape(1, -1), 2
202 | )
203 | assert predictions.shape == (1, 2)
204 |
205 | predictions, confidence, all_scores, all_scores_letters = no_cat_var_model.predict_and_explain(
206 | np.asarray(["Nouveau budget présenté par le gouvernement"]), 2
207 | )
208 | assert predictions.shape == (1, 2)
209 | # "predictions" contains the predicted class for each input text, in int format. Need to decode back to have the string format
210 |
--------------------------------------------------------------------------------
/tests/test_fasttext_model_dataset.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from torchFastText.datasets.dataset import FastTextModelDataset
3 | from torchFastText.datasets.tokenizer import NGramTokenizer
4 |
5 |
6 | @pytest.fixture
7 | def dataset():
8 | categorical_variables = [[1, 2], [3, 4], [5, 6]]
9 | texts = ["This is a test", "Another test", "Yet another test"]
10 | outputs = [0, 1, 0]
11 | tokenizer = NGramTokenizer(
12 | num_tokens=5,
13 | min_count=1,
14 | min_n=2,
15 | max_n=3,
16 | buckets=100,
17 | len_word_ngrams=2,
18 | training_text=texts,
19 | )
20 | return FastTextModelDataset(
21 | categorical_variables=categorical_variables,
22 | texts=texts,
23 | outputs=outputs,
24 | tokenizer=tokenizer,
25 | )
26 |
27 |
28 | def test_getitem(dataset):
29 | text, cat_variable, y = dataset[0]
30 | assert text == "This is a test"
31 | assert cat_variable == [1, 2]
32 | assert y == 0
33 |
--------------------------------------------------------------------------------
/tests/test_ngramtokenizer.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from torchFastText.datasets.tokenizer import NGramTokenizer
3 |
4 |
5 | def test_ngramtokenizer_init_valid():
6 | training_text = ["this is a test", "another test sentence"]
7 | tokenizer = NGramTokenizer(
8 | num_tokens=5,
9 | min_count=1,
10 | min_n=2,
11 | max_n=6,
12 | buckets=100,
13 | len_word_ngrams=2,
14 | training_text=training_text,
15 | )
16 | assert tokenizer.min_n == 2
17 | assert tokenizer.max_n == 6
18 | assert tokenizer.word_ngrams == 2
19 | assert tokenizer.nwords == 6 # "this", "is", "a", "test", "another", "sentence"
20 |
21 |
22 | def test_ngramtokenizer_init_min_n_invalid():
23 | training_text = ["this is a test", "another test sentence"]
24 | with pytest.raises(ValueError, match="`min_n` parameter must be greater than 1."):
25 | NGramTokenizer(
26 | num_tokens=5,
27 | min_count=1,
28 | min_n=1,
29 | max_n=6,
30 | buckets=100,
31 | len_word_ngrams=2,
32 | training_text=training_text,
33 | )
34 |
35 |
36 | def test_ngramtokenizer_init_max_n_invalid():
37 | training_text = ["this is a test", "another test sentence"]
38 | with pytest.raises(ValueError, match="`max_n` parameter must be smaller than 7."):
39 | NGramTokenizer(
40 | num_tokens=5,
41 | min_count=1,
42 | min_n=2,
43 | max_n=8,
44 | buckets=100,
45 | len_word_ngrams=2,
46 | training_text=training_text,
47 | )
48 |
49 |
50 | def test_ngramtokenizer_init_min_count():
51 | training_text = ["this is a test", "this is another test"]
52 | tokenizer = NGramTokenizer(
53 | num_tokens=5,
54 | min_count=2,
55 | min_n=2,
56 | max_n=6,
57 | buckets=100,
58 | len_word_ngrams=2,
59 | training_text=training_text,
60 | )
61 | assert tokenizer.nwords == 3 # "this", "is", "test" (appears at least twice)
62 |
63 |
64 | def test_ngramtokenizer_word_id_mapping():
65 | training_text = ["this is a test", "this is another test"]
66 | tokenizer = NGramTokenizer(
67 | num_tokens=5,
68 | min_count=1,
69 | min_n=2,
70 | max_n=6,
71 | buckets=100,
72 | len_word_ngrams=2,
73 | training_text=training_text,
74 | )
75 | expected_mapping = {"this": 1, "is": 2, "a": 3, "test": 4, "another": 5}
76 | assert tokenizer.word_id_mapping == expected_mapping
77 |
78 |
79 | def test_ngramtokenizer_get_ngram_list():
80 | word = "test"
81 | n = 2
82 | ngrams = NGramTokenizer.get_ngram_list(word, n)
83 | print(ngrams)
84 | assert ngrams == ["te", "es", "st"]
85 |
86 |
87 | def test_ngramtokenizer_get_subwords():
88 | training_text = ["this is a test", "this is another test"]
89 | tokenizer = NGramTokenizer(
90 | num_tokens=5,
91 | min_count=1,
92 | min_n=2,
93 | max_n=3,
94 | buckets=100,
95 | len_word_ngrams=2,
96 | training_text=training_text,
97 | )
98 | subwords = tokenizer.get_subwords("this is a test")
99 | print(subwords)
100 | assert subwords[0] == [
101 | "",
116 | "",
130 | ]
131 |
--------------------------------------------------------------------------------
/torchFastText/__init__.py:
--------------------------------------------------------------------------------
1 | from .torchFastText import torchFastText as torchFastText
2 |
--------------------------------------------------------------------------------
/torchFastText/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from .tokenizer import NGramTokenizer as NGramTokenizer
2 | from .dataset import FastTextModelDataset as FastTextModelDataset
3 |
--------------------------------------------------------------------------------
/torchFastText/datasets/dataset.py:
--------------------------------------------------------------------------------
1 | """
2 | Dataset class for a FastTextModel without the fastText dependency.
3 | """
4 |
5 | import os
6 | import logging
7 | from typing import List
8 |
9 | import torch
10 |
11 | from .tokenizer import NGramTokenizer
12 |
13 | logger = logging.getLogger(__name__)
14 |
15 | logging.basicConfig(
16 | level=logging.INFO,
17 | format="%(asctime)s - %(name)s - %(message)s",
18 | datefmt="%Y-%m-%d %H:%M:%S",
19 | handlers=[logging.StreamHandler()],
20 | )
21 |
22 |
23 | class FastTextModelDataset(torch.utils.data.Dataset):
24 | """
25 | FastTextModelDataset class.
26 | """
27 |
28 | def __init__(
29 | self,
30 | categorical_variables: List[List[int]],
31 | texts: List[str],
32 | tokenizer: NGramTokenizer,
33 | outputs: List[int] = None,
34 | **kwargs,
35 | ):
36 | """
37 | Constructor for the FastTextModelDataset class.
38 | 
39 | Args:
40 | categorical_variables (List[List[int]]): The elements of this list
41 | are the values of each categorical variable across the dataset.
42 | texts (List[str]): List of text descriptions.
43 | tokenizer (NGramTokenizer): Tokenizer.
44 | outputs (List[int], optional): List of outcomes. Defaults to None.
45 | """
46 |
47 | if categorical_variables is not None and len(categorical_variables) != len(texts):
48 | raise ValueError("Categorical variables and texts must have the same length.")
49 |
50 | if outputs is not None and len(outputs) != len(texts):
51 | raise ValueError("Outputs and texts must have the same length.")
52 |
53 | self.categorical_variables = categorical_variables
54 | self.texts = texts
55 | self.outputs = outputs
56 | self.tokenizer = tokenizer
57 |
58 | def __len__(self) -> int:
59 | """
60 | Returns length of the data.
61 |
62 | Returns:
63 | int: Number of observations.
64 | """
65 | return len(self.texts)
66 |
67 | def __str__(self) -> str:
68 | """
69 | Returns description of the Dataset.
70 |
71 | Returns:
72 | str: Description.
73 | """
74 | return f""
75 |
76 | def __getitem__(self, index: int) -> List:
77 | """
78 | Returns observation for a given index.
79 |
80 | Args:
81 | index (int): Index.
82 |
83 | Returns:
84 | List[int, str]: Observation with given index.
85 | """
86 | categorical_variables = (
87 | self.categorical_variables[index] if self.categorical_variables is not None else None
88 | )
89 | text = self.texts[index]
90 |
91 | if self.outputs is not None:
92 | y = self.outputs[index]
93 | return text, categorical_variables, y
94 | else:
95 | return text, categorical_variables
96 |
97 | def collate_fn(self, batch):
98 | """
99 | Efficient batch processing without explicit loops.
100 |
101 | Args:
102 | batch: Data batch.
103 |
104 | Returns:
105 | Tuple[torch.Tensor]: Padded text indices, categorical variables, and (if present) labels.
106 | """
107 |
108 | # Unzip the batch in one go using zip(*batch)
109 | if self.outputs is not None:
110 | text, *categorical_vars, y = zip(*batch)
111 | else:
112 | text, *categorical_vars = zip(*batch)
113 |
114 | # Convert text to indices in parallel using map
115 | indices_batch = list(map(lambda x: self.tokenizer.indices_matrix(x)[0], text))
116 |
117 | # Get padding index once
118 | padding_index = self.tokenizer.get_buckets() + self.tokenizer.get_nwords()
119 |
120 | # Pad sequences efficiently
121 | padded_batch = torch.nn.utils.rnn.pad_sequence(
122 | indices_batch,
123 | batch_first=True,
124 | padding_value=padding_index,
125 | )
126 |
127 | # Handle categorical variables efficiently
128 | if self.categorical_variables is not None:
129 | categorical_tensors = torch.stack(
130 | [
131 | torch.tensor(cat_var, dtype=torch.float32)
132 | for cat_var in categorical_vars[
133 | 0
134 | ] # Access first element since zip returns tuple
135 | ]
136 | )
137 | else:
138 | categorical_tensors = torch.empty(
139 | padded_batch.shape[0], 1, dtype=torch.float32, device=padded_batch.device
140 | )
141 |
142 | if self.outputs is not None:
143 | # Convert labels to tensor in one go
144 | y = torch.tensor(y, dtype=torch.long)
145 | return (padded_batch, categorical_tensors, y)
146 | else:
147 | return (padded_batch, categorical_tensors)
148 |
149 | def create_dataloader(
150 | self,
151 | batch_size: int,
152 | shuffle: bool = False,
153 | drop_last: bool = False,
154 | num_workers: int = os.cpu_count() - 1,
155 | pin_memory: bool = True,
156 | persistent_workers: bool = True,
157 | **kwargs,
158 | ) -> torch.utils.data.DataLoader:
159 | """
160 | Creates a Dataloader from the FastTextModelDataset.
161 | Use collate_fn() to tokenize and pad the sequences.
162 |
163 | Args:
164 | batch_size (int): Batch size.
165 | shuffle (bool, optional): Shuffle option. Defaults to False.
166 | drop_last (bool, optional): Drop last option. Defaults to False.
167 | num_workers (int, optional): Number of workers. Defaults to os.cpu_count() - 1.
168 | pin_memory (bool, optional): Set True if working on GPU, False if CPU. Defaults to True.
169 | persistent_workers (bool, optional): Set True for training, False for inference. Defaults to True.
170 | **kwargs: Additional arguments for PyTorch DataLoader.
171 |
172 | Returns:
173 | torch.utils.data.DataLoader: Dataloader.
174 | """
175 |
176 | logger.info(f"Creating DataLoader with {num_workers} workers.")
177 |
178 | return torch.utils.data.DataLoader(
179 | dataset=self,
180 | batch_size=batch_size,
181 | collate_fn=self.collate_fn,
182 | shuffle=shuffle,
183 | drop_last=drop_last,
184 | pin_memory=pin_memory,
185 | num_workers=num_workers,
186 | persistent_workers=persistent_workers,
187 | **kwargs,
188 | )
189 |
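190 | # Usage sketch (illustrative variable names, not part of the library API):
191 | #   tokenizer = NGramTokenizer(min_count=1, min_n=2, max_n=6, num_tokens=100,
192 | #                              len_word_ngrams=2, training_text=texts)
193 | #   dataset = FastTextModelDataset(categorical_variables=cats, texts=texts,
194 | #                                  outputs=labels, tokenizer=tokenizer)
195 | #   loader = dataset.create_dataloader(batch_size=32, shuffle=True)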
--------------------------------------------------------------------------------
/torchFastText/datasets/tokenizer.py:
--------------------------------------------------------------------------------
1 | """
2 | NGramTokenizer class.
3 | """
4 |
5 | import ctypes
6 | import json
7 | from typing import List, Tuple, Type
8 | 
9 | import torch
16 |
17 | from ..preprocess import clean_text_feature
18 |
19 |
20 | class NGramTokenizer:
21 | """
22 | NGramTokenizer class.
23 | """
24 |
25 | def __init__(
26 | self,
27 | min_count: int,
28 | min_n: int,
29 | max_n: int,
30 | num_tokens: int,
31 | len_word_ngrams: int,
32 | training_text: List[str],
33 | **kwargs,
34 | ):
35 | """
36 | Constructor for the NGramTokenizer class.
37 |
38 | Args:
39 | min_count (int): Minimum number of times a word has to be
40 | in the training data to be given an embedding.
41 | min_n (int): Minimum length of character n-grams.
42 | max_n (int): Maximum length of character n-grams.
43 | num_tokens (int): Number of rows in the embedding matrix.
44 | len_word_ngrams (int): Maximum length of word n-grams.
45 | training_text (List[str]): List of training texts.
46 |
47 | Raises:
48 | ValueError: If `min_n` is 1 or smaller.
49 | ValueError: If `max_n` is 7 or higher.
50 | """
51 | if min_n < 2:
52 | raise ValueError("`min_n` parameter must be greater than 1.")
53 | if max_n > 6:
54 | raise ValueError("`max_n` parameter must be smaller than 7.")
55 |
56 | self.min_count = min_count
57 | self.min_n = min_n
58 | self.max_n = max_n
59 | self.num_tokens = num_tokens
60 | self.word_ngrams = len_word_ngrams
61 |
62 | word_counts = {}
63 | for sentence in training_text:
64 | for word in sentence.split(" "):
65 | word_counts[word] = word_counts.setdefault(word, 0) + 1
66 |
67 | self.word_id_mapping = {}
68 | i = 1
69 | for word, counts in word_counts.items():
70 | if word_counts[word] >= min_count:
71 | self.word_id_mapping[word] = i
72 | i += 1
73 | self.nwords = len(self.word_id_mapping)
74 |
75 | self.padding_index = self.num_tokens + self.get_nwords()
76 |
77 | def __str__(self) -> str:
78 | """
79 | Returns description of the NGramTokenizer.
80 |
81 | Returns:
82 | str: Description.
83 | """
84 | return f""
85 |
86 | def get_nwords(self) -> int:
87 | """
88 | Return number of words kept in training data.
89 |
90 | Returns:
91 | int: Number of words.
92 | """
93 | return self.nwords
94 |
95 | def get_buckets(self) -> int:
96 | """
97 | Return number of buckets for tokenizer.
98 |
99 | Returns:
100 | int: Number of buckets.
101 | """
102 | return self.num_tokens
103 |
104 | @staticmethod
105 | def get_ngram_list(word: str, n: int) -> List[str]:
106 | """
107 | Return the list of character n-grams for a word with a
108 | given n.
109 |
110 | Args:
111 | word (str): Word.
112 | n (int): Length of the n-grams.
113 |
114 | Returns:
115 | List[str]: List of character n-grams.
116 | """
117 | return [word[i : i + n] for i in range(len(word) - n + 1)]
118 |
119 | @staticmethod
120 | def get_hash(subword: str) -> int:
121 | """
122 | Return hash for a given subword.
123 |
124 | Args:
125 | subword (str): Character n-gram.
126 |
127 | Returns:
128 | int: Corresponding hash.
129 | """
130 | # FNV-1a hash, computed with 32-bit overflow semantics
131 | h = ctypes.c_uint32(2166136261).value  # FNV offset basis
132 | for c in subword:
133 | c = ctypes.c_int8(ord(c)).value
134 | h = ctypes.c_uint32(h ^ c).value
135 | h = ctypes.c_uint32(h * 16777619).value  # FNV prime
136 | return h
136 |
137 | @staticmethod
138 | def get_word_ngram_id(hashes: Tuple[int], bucket: int, nwords: int) -> int:
139 | """
140 | Get word ngram index in the embedding matrix.
141 |
142 | Args:
143 | hashes (Tuple[int]): Word hashes.
144 | bucket (int): Number of rows in embedding matrix.
145 | nwords (int): Number of words in the vocabulary.
146 |
147 | Returns:
148 | int: Word ngram hash.
149 | """
150 | hashes = [ctypes.c_int32(hash_value).value for hash_value in hashes]
151 | h = ctypes.c_uint64(hashes[0]).value
152 | for j in range(1, len(hashes)):
153 | h = ctypes.c_uint64((h * 116049371)).value
154 | h = ctypes.c_uint64(h + hashes[j]).value
155 | return h % bucket + nwords
156 |
157 | def get_subword_index(self, subword: str) -> int:
158 | """
159 | Return the row index from the embedding matrix which
160 | corresponds to a character n-gram.
161 |
162 | Args:
163 | subword (str): Character n-gram.
164 |
165 | Returns:
166 | int: Index.
167 | """
168 | return self.get_hash(subword) % self.num_tokens + self.nwords
169 |
170 | def get_word_index(self, word: str) -> int:
171 | """
172 | Return the row index from the embedding matrix which
173 | corresponds to a word.
174 |
175 | Args:
176 | word (str): Word.
177 |
178 | Returns:
179 | int: Index.
180 | """
181 | return self.word_id_mapping[word]
182 |
183 | def get_subwords(self, word: str) -> Tuple[List[str], List[int]]:
184 | """
185 | Return all subwords tokens and indices for a given word.
186 | Also adds the whole word token and index if the word is in word_id_mapping
187 | (==> the word is in initial vocabulary + seen at least MIN_COUNT times).
188 | Adds tags "<" and ">" to the word.
189 |
190 | Args:
191 | word (str): Word.
192 |
193 | Returns:
194 | Tuple[List[str], List[int]]: Tuple of tokens and indices.
195 | """
196 | tokens = []
197 | word_with_tags = "<" + word + ">"
198 |
199 | # Get subwords and associated indices WITHOUT the whole word
200 | for n in range(self.min_n, self.max_n + 1):
201 | ngrams = self.get_ngram_list(word_with_tags, n)
202 | tokens += [
203 | ngram for ngram in ngrams if ngram != word_with_tags and ngram != word
204 | ] # Exclude the full word
205 |
206 | indices = [self.get_subword_index(token) for token in tokens]
207 | assert word not in tokens
208 |
209 | # Add the whole-word token and index only if the word is in word_id_mapping
210 | if word in self.word_id_mapping.keys():
211 | tokens = [word] + tokens
212 | indices = [self.get_word_index(word)] + indices
214 |
215 | return (tokens, indices)
216 |
217 | def indices_matrix(self, sentence: str) -> tuple[torch.Tensor, dict, dict]:
218 | """
219 | Returns an array of token indices for a text description.
220 |
221 | Args:
222 | sentence (str): Text description.
223 |
224 | Returns:
225 | tuple: (torch.Tensor of indices, id_to_token dict, token_to_id dict)
226 | """
227 | # Pre-split the sentence once
228 | words = sentence.split()
229 | words.append("") # Add end of string token
230 |
231 | indices = []
232 | all_tokens_id = {}
233 |
234 | # Process subwords in one batch
235 | for word in words[:-1]:  # Exclude </s> from subword processing
236 | tokens, ind = self.get_subwords(word)
237 | indices.extend(ind)
238 | # Update dictionary with zip for efficiency
239 | all_tokens_id.update(zip(tokens, ind))
240 |
241 | # Add </s> token
242 | indices.append(0)
243 | all_tokens_id["</s>"] = 0
244 |
245 | # Compute word n-grams more efficiently
246 | if self.word_ngrams > 1:
247 | # Pre-compute hashes for all words to avoid repeated computation
248 | word_hashes = [self.get_hash(word) for word in words]
249 |
250 | # Generate n-grams using sliding window
251 | word_ngram_ids = []
252 | for n in range(2, self.word_ngrams + 1):
253 | for i in range(len(words) - n + 1):
254 | # Get slice of hashes for current n-gram
255 | gram_hashes = tuple(word_hashes[i : i + n])
256 |
257 | # Compute n-gram ID
258 | word_ngram_id = int(
259 | self.get_word_ngram_id(gram_hashes, self.num_tokens, self.nwords)
260 | )
261 |
262 | # Store gram and its ID
263 | gram = " ".join(words[i : i + n])
264 | all_tokens_id[gram] = word_ngram_id
265 | word_ngram_ids.append(word_ngram_id)
266 |
267 | # Extend indices with n-gram IDs
268 | indices.extend(word_ngram_ids)
269 |
270 | # Create reverse mapping once at the end
271 | id_to_token = {v: k for k, v in all_tokens_id.items()}
272 |
273 | # Convert to tensor directly
274 | return torch.tensor(indices, dtype=torch.long), id_to_token, all_tokens_id
275 |
276 | def tokenize(self, text: list[str], text_tokens=True, preprocess=False):
277 | """
278 | Tokenize a list of sentences.
279 |
280 | Args:
281 | text (list[str]): List of sentences.
282 | text_tokens (bool): If True, return tokenized text in tokens.
283 | preprocess (bool): If True, preprocess text. Needs unidecode library.
284 |
285 | Returns:
286 | np.array: Array of indices.
287 | """
288 |
289 | if preprocess:
290 | text = clean_text_feature(text)
291 |
292 | tokenized_text = []
293 | id_to_token_dicts = []
294 | token_to_id_dicts = []
295 | for sentence in text:
296 | all_ind, id_to_token, token_to_id = self.indices_matrix(
297 | sentence
298 | ) # tokenize and convert to token indices
299 | tokenized_text.append(all_ind)
300 | id_to_token_dicts.append(id_to_token)
301 | token_to_id_dicts.append(token_to_id)
302 |
303 | if text_tokens:
304 | tokenized_text_tokens = self._tokenized_text_in_tokens(
305 | tokenized_text, id_to_token_dicts
306 | )
307 | return tokenized_text_tokens, tokenized_text, id_to_token_dicts, token_to_id_dicts
308 | else:
309 | return tokenized_text, id_to_token_dicts, token_to_id_dicts
310 |
311 | def _tokenized_text_in_tokens(self, tokenized_text, id_to_token_dicts):
312 | """
313 | Convert tokenized text in int format to tokens in str format (given a mapping dictionary).
314 | Private method. Used in tokenizer.tokenize and pytorch_model.predict()
315 |
316 | Args:
317 | tokenized_text (list): List of tokenized text in int format.
318 | id_to_token_dicts (list[Dict]): List of dictionaries mapping token indices to tokens.
319 |
320 | Both lists have the same length (number of sentences).
321 |
322 | Returns:
323 | list[list[str]]: List of tokenized text in str format.
324 |
325 | """
326 |
327 | return [
328 | [
329 | id_to_token_dicts[i][token_id.item()]
330 | for token_id in tokenized_sentence
331 | if token_id.item() not in {self.padding_index}
332 | ]
333 | for i, tokenized_sentence in enumerate(tokenized_text)
334 | ]
335 |
336 | def get_vocab(self):
337 | return self.word_id_mapping
338 |
339 | @classmethod
340 | def from_json(cls: Type["NGramTokenizer"], filepath: str, training_text) -> "NGramTokenizer":
341 | """
342 | Load a tokenizer instance from a JSON file (plus the training text).
343 | """
344 | with open(filepath, "r") as f:
345 | data = json.load(f)
346 | return cls(**data, training_text=training_text)
347 |
--------------------------------------------------------------------------------
/torchFastText/explainability/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Init script.
3 | """
4 |
--------------------------------------------------------------------------------
/torchFastText/explainability/visualisation.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | import seaborn as sns
4 |
5 |
6 | def visualize_word_scores(all_scores, original_texts, pred):
7 | for idx, word_to_score_topk in enumerate(all_scores): # iterate over sentences
8 | all_scores_topk = all_scores[idx]
9 | topk = len(all_scores_topk)
10 | colors = sns.color_palette("mako", n_colors=topk)
11 |
12 | original_words = original_texts[idx].split()
13 | original_words = list(filter(lambda x: x != ",", original_words))
14 |
15 | for i, word in enumerate(original_words):
16 | original_words[i] = word.replace(",", "")
17 |
18 | plt.figure(figsize=(16, 6))
19 | plt.title(
20 | f"Word Scores Visualization for Sentence {idx + 1}", fontsize=20, fontweight="bold"
21 | )
22 |
23 | bar_width = 0.15 # Width of each bar
24 | indices = np.arange(len(original_words))
25 |
26 | for k in range(topk):
27 | scores = all_scores_topk[k]
28 | plt.bar(
29 | indices + k * bar_width,
30 | scores,
31 | bar_width,
32 | color=colors[k % len(colors)],
33 | label=f"{pred[idx][k]}",
34 | )
35 |
36 | # Add labels and legend
37 | plt.xlabel("Words", fontsize=12)
38 | plt.ylabel("Scores", fontsize=12)
39 | plt.xticks(
40 | ticks=indices + bar_width * (topk - 1) / 2,
41 | labels=original_words,
42 | rotation=45,
43 | fontsize=10,
44 | )
45 | plt.ylim(0, 1) # Since scores are between 0 and 1 (softmax output)
46 | plt.legend()
47 |
48 | # Show the plot
49 | plt.tight_layout()
50 | plt.show()
51 |
52 |
53 | def visualize_letter_scores(all_scores_letters, original_texts, pred):
54 | topk = len(all_scores_letters)
55 | for text in original_texts:
56 | text = [text]
57 | all_letters = [list(word) for word in text][0]
58 |
59 | for idx, lett in enumerate(all_letters):
60 | if lett == " ":
61 | all_scores_letters = np.insert(all_scores_letters, idx, 0, axis=2)
62 |
63 | colors = sns.color_palette("mako", n_colors=topk)
64 | scores = all_scores_letters
65 | plt.figure(figsize=(len(all_letters) / 7, 5))
66 |
67 | for k in range(scores.shape[0]):
68 | res = scores[0, k].cpu().numpy()
69 | plt.bar(
70 | range(len(res)),
71 | res,
72 | label=pred[0][k],
73 | width=0.5,
74 | color=colors[k % len(colors)],
75 | )
76 | plt.xticks(range(len(all_letters)), all_letters, rotation=0, fontsize=10)
77 | plt.legend()
78 | plt.show()
79 |
--------------------------------------------------------------------------------
/torchFastText/model/__init__.py:
--------------------------------------------------------------------------------
1 | from .pytorch_model import FastTextModel
2 | from .lightning_module import FastTextModule
3 |
--------------------------------------------------------------------------------
/torchFastText/model/lightning_module.py:
--------------------------------------------------------------------------------
1 | import pytorch_lightning as pl
2 | import torch
3 | from torchmetrics import Accuracy
4 |
5 | from .pytorch_model import FastTextModel
6 |
7 |
8 | class FastTextModule(pl.LightningModule):
9 | """
10 | Pytorch Lightning Module for FastTextModel.
11 | """
12 |
13 | def __init__(
14 | self,
15 | model: FastTextModel,
16 | loss,
17 | optimizer,
18 | optimizer_params,
19 | scheduler,
20 | scheduler_params,
21 | scheduler_interval="epoch",
22 | **kwargs,
23 | ):
24 | """
25 | Initialize FastTextModule.
26 |
27 | Args:
28 | model: Model.
29 | loss: Loss
30 | optimizer: Optimizer
31 | optimizer_params: Optimizer parameters.
32 | scheduler: Scheduler.
33 | scheduler_params: Scheduler parameters.
34 | scheduler_interval: Scheduler interval.
35 | """
36 | super().__init__()
37 | self.save_hyperparameters(ignore=["model", "loss"])
38 |
39 | self.model = model
40 | self.loss = loss
41 | self.accuracy_fn = Accuracy(task="multiclass", num_classes=self.model.num_classes)
42 | self.optimizer = optimizer
43 | self.optimizer_params = optimizer_params
44 | self.scheduler = scheduler
45 | self.scheduler_params = scheduler_params
46 | self.scheduler_interval = scheduler_interval
47 |
48 | def forward(self, inputs) -> torch.Tensor:
49 | """
50 | Perform forward-pass.
51 |
52 | Args:
53 | inputs (List[torch.Tensor]): Tokenized text indices and categorical inputs.
54 |
55 | Returns (torch.Tensor): Prediction.
56 | """
57 | return self.model(inputs[0], inputs[1])
58 |
59 | def training_step(self, batch, batch_idx: int) -> torch.Tensor:
60 | """
61 | Training step.
62 |
63 | Args:
64 | batch (List[torch.LongTensor]): Training batch.
65 | batch_idx (int): Batch index.
66 |
67 | Returns (torch.Tensor): Loss tensor.
68 | """
69 |
70 | inputs, targets = batch[:-1], batch[-1]
71 | outputs = self.forward(inputs)
72 | loss = self.loss(outputs, targets)
73 | self.log("train_loss", loss, on_epoch=True, on_step=True, prog_bar=True)
74 | accuracy = self.accuracy_fn(outputs, targets)
75 | self.log("train_accuracy", accuracy, on_epoch=True, on_step=False, prog_bar=True)
76 |
77 | torch.cuda.empty_cache()
78 |
79 | return loss
80 |
81 | def validation_step(self, batch, batch_idx: int):
82 | """
83 | Validation step.
84 |
85 | Args:
86 | batch (List[torch.LongTensor]): Validation batch.
87 | batch_idx (int): Batch index.
88 |
89 | Returns (torch.Tensor): Loss tensor.
90 | """
91 | inputs, targets = batch[:-1], batch[-1]
92 | outputs = self.forward(inputs)
93 | loss = self.loss(outputs, targets)
94 | self.log("val_loss", loss, on_epoch=True, on_step=False, prog_bar=True, sync_dist=True)
95 |
96 | accuracy = self.accuracy_fn(outputs, targets)
97 | self.log("val_accuracy", accuracy, on_epoch=True, on_step=False, prog_bar=True)
98 | return loss
99 |
100 | def test_step(self, batch, batch_idx: int):
101 | """
102 | Test step.
103 |
104 | Args:
105 | batch (List[torch.LongTensor]): Test batch.
106 | batch_idx (int): Batch index.
107 |
108 | Returns (torch.Tensor): Loss tensor.
109 | """
110 | inputs, targets = batch[:-1], batch[-1]
111 | outputs = self.forward(inputs)
112 | loss = self.loss(outputs, targets)
113 |
114 | accuracy = self.accuracy_fn(outputs, targets)
115 |
116 | return loss, accuracy
117 |
118 | def configure_optimizers(self):
119 | """
120 | Configure optimizer and scheduler for PyTorch Lightning.
121 | 
122 | Returns: Optimizer and scheduler for PyTorch Lightning.
123 | """
124 | optimizer = self.optimizer(self.parameters(), **self.optimizer_params)
125 | scheduler = self.scheduler(optimizer, **self.scheduler_params)
126 | scheduler = {
127 | "scheduler": scheduler,
128 | "monitor": "val_loss",
129 | "interval": self.scheduler_interval,
130 | }
131 |
132 | return [optimizer], [scheduler]
133 |
--------------------------------------------------------------------------------
/torchFastText/model/losses.py:
--------------------------------------------------------------------------------
1 | import torch.nn.functional as F
2 | from torch import nn
3 |
4 |
5 | class OneVsAllLoss(nn.Module):
6 | def __init__(self):
7 | super(OneVsAllLoss, self).__init__()
8 |
9 | def forward(self, logits, targets):
10 | """
11 | Compute One-vs-All loss
12 |
13 | Args:
14 | logits: Tensor of shape (batch_size, num_classes) containing classification scores
15 | targets: Tensor of shape (batch_size) containing true class indices
16 |
17 | Returns:
18 | loss: Mean loss value across the batch
19 | """
20 |
21 | num_classes = logits.size(1)
22 |
23 | # Convert targets to one-hot encoding
24 | targets_one_hot = F.one_hot(targets, num_classes=num_classes).float()
25 |
26 | # For each sample, treat the true class as positive and all others as negative
27 | # Using binary cross entropy for each class
28 | loss = F.binary_cross_entropy_with_logits(
29 | logits, # Raw logits
30 | targets_one_hot, # Target probabilities
31 | reduction="none", # Don't reduce yet to allow for custom weighting if needed
32 | )
33 |
34 | # Sum losses across all classes for each sample, then take mean across batch
35 | return loss.sum(dim=1).mean()
36 |
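37 | # Usage sketch (illustrative, assumes `import torch`):
38 | #   loss_fn = OneVsAllLoss()
39 | #   logits = torch.randn(4, 10)           # (batch_size, num_classes)
40 | #   targets = torch.randint(0, 10, (4,))  # true class indices
41 | #   loss = loss_fn(logits, targets)       # scalar: batch mean of summed per-class BCE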
--------------------------------------------------------------------------------
/torchFastText/model/pytorch_model.py:
--------------------------------------------------------------------------------
1 | """
2 | FastText model implemented with Pytorch.
3 | Integrates additional categorical features.
4 | """
5 |
6 | from typing import List, Union
7 | import logging
8 |
9 | import torch
10 |
11 | try:
12 | from captum.attr import LayerIntegratedGradients
13 |
14 | HAS_CAPTUM = True
15 | except ImportError:
16 | HAS_CAPTUM = False
17 |
18 | from torch import nn
19 |
20 | from ..utilities.utils import (
21 | compute_preprocessed_word_score,
22 | compute_word_score,
23 | explain_continuous,
24 | )
25 | from ..utilities.checkers import validate_categorical_inputs
26 |
27 | logger = logging.getLogger(__name__)
28 |
29 | logging.basicConfig(
30 | level=logging.INFO,
31 | format="%(asctime)s - %(name)s - %(message)s",
32 | datefmt="%Y-%m-%d %H:%M:%S",
33 | handlers=[logging.StreamHandler()],
34 | )
35 |
36 |
37 | class FastTextModel(nn.Module):
38 | """
39 | FastText Pytorch Model.
40 | """
41 |
42 | def __init__(
43 | self,
44 | embedding_dim: int,
45 | num_classes: int,
46 | tokenizer=None,
47 | num_rows: int = None,
48 | categorical_vocabulary_sizes: List[int] = None,
49 | categorical_embedding_dims: Union[List[int], int] = None,
50 | padding_idx: int = 0,
51 | sparse: bool = True,
52 | direct_bagging: bool = False,
53 | ):
54 | """
55 | Constructor for the FastTextModel class.
56 |
57 | Args:
58 | embedding_dim (int): Dimension of the text embedding space.
59 | num_classes (int): Number of classes.
60 | tokenizer (NGramTokenizer, optional): Trained tokenizer, used to size the embedding matrix.
61 | num_rows (int, optional): Number of rows in the embedding matrix.
62 | categorical_vocabulary_sizes (List[int]): List of the number of
63 | modalities for additional categorical features.
64 | categorical_embedding_dims (Union[List[int], int], optional): Embedding dimension(s) for the categorical features.
65 | padding_idx (int, optional): Padding index for the text
66 | descriptions. Defaults to 0.
67 | sparse (bool): Indicates if Embedding layer is sparse.
68 | direct_bagging (bool): Use EmbeddingBag instead of Embedding for the text embedding.
67 | """
68 | super(FastTextModel, self).__init__()
69 |
70 | if isinstance(categorical_embedding_dims, int):
71 |             self.average_cat_embed = True  # an int means: average the categorical embeddings before concatenating them to the sentence embedding
72 | else:
73 | self.average_cat_embed = False
74 |
75 | categorical_vocabulary_sizes, categorical_embedding_dims, num_categorical_features = (
76 | validate_categorical_inputs(
77 | categorical_vocabulary_sizes,
78 | categorical_embedding_dims,
79 | num_categorical_features=None,
80 | )
81 | )
82 |
83 | assert isinstance(categorical_embedding_dims, list) or categorical_embedding_dims is None, (
84 | "categorical_embedding_dims must be a list of int at this stage"
85 | )
86 |
87 | if categorical_embedding_dims is None:
88 | self.average_cat_embed = False
89 |
90 | if tokenizer is None:
91 | if num_rows is None:
92 | raise ValueError(
93 | "Either tokenizer or num_rows must be provided (number of rows in the embedding matrix)."
94 | )
95 | else:
96 | if num_rows is not None:
97 | if num_rows != tokenizer.num_tokens:
98 | logger.warning(
99 | "num_rows is different from the number of tokens in the tokenizer. Using provided num_rows."
100 | )
101 |             if num_rows is None: num_rows = tokenizer.padding_index + 1  # fall back to the tokenizer-derived size
102 | self.num_rows = num_rows
103 |
104 | self.num_classes = num_classes
105 | self.padding_idx = padding_idx
106 | self.tokenizer = tokenizer
107 | self.embedding_dim = embedding_dim
108 | self.direct_bagging = direct_bagging
109 | self.sparse = sparse
110 |
111 | self.categorical_embedding_dims = categorical_embedding_dims
112 |
113 | self.embeddings = (
114 | nn.Embedding(
115 | embedding_dim=embedding_dim,
116 | num_embeddings=num_rows,
117 | padding_idx=padding_idx,
118 | sparse=sparse,
119 | )
120 | if not direct_bagging
121 | else nn.EmbeddingBag(
122 | embedding_dim=embedding_dim,
123 | num_embeddings=num_rows,
124 | padding_idx=padding_idx,
125 | sparse=sparse,
126 | mode="mean",
127 | )
128 | )
129 |
130 | self.categorical_embedding_layers = {}
131 |
132 | # Entry dim for the last layer:
133 |         # 1. embedding_dim if there are no categorical variables, or if their embeddings are summed to the sentence embedding
134 |         # 2. embedding_dim + cat_embedding_dim if the categorical embeddings are averaged before concatenation (categorical_embedding_dims is an int)
135 |         # 3. embedding_dim + sum(categorical_embedding_dims) if the categorical embeddings are concatenated individually (categorical_embedding_dims is a list)
136 | dim_in_last_layer = embedding_dim
137 | if self.average_cat_embed:
138 | dim_in_last_layer += categorical_embedding_dims[0]
139 |
140 | if categorical_vocabulary_sizes is not None:
141 | self.no_cat_var = False
142 |             for var_idx, vocab_size in enumerate(categorical_vocabulary_sizes):  # avoid shadowing num_rows
143 | if categorical_embedding_dims is not None:
144 | emb = nn.Embedding(
145 |                         embedding_dim=categorical_embedding_dims[var_idx], num_embeddings=vocab_size
146 | ) # concatenate to sentence embedding
147 | if not self.average_cat_embed:
148 | dim_in_last_layer += categorical_embedding_dims[var_idx]
149 | else:
150 | emb = nn.Embedding(
151 |                         embedding_dim=embedding_dim, num_embeddings=vocab_size
152 | ) # sum to sentence embedding
153 | self.categorical_embedding_layers[var_idx] = emb
154 | setattr(self, "emb_{}".format(var_idx), emb)
155 | else:
156 | self.no_cat_var = True
157 |
158 | self.fc = nn.Linear(dim_in_last_layer, num_classes)
159 |
160 | def forward(self, encoded_text: torch.Tensor, additional_inputs: torch.Tensor) -> torch.Tensor:
161 | """
162 | Memory-efficient forward pass implementation.
163 |
164 | Args:
165 | encoded_text (torch.Tensor[Long]), shape (batch_size, seq_len): Tokenized + padded text
166 | additional_inputs (torch.Tensor[Long]): Additional categorical features, (batch_size, num_categorical_features)
167 |
168 | Returns:
169 | torch.Tensor: Model output scores for each class
170 | """
171 | batch_size = encoded_text.size(0)
172 |
173 |         # Ensure correct dtype once
174 | if encoded_text.dtype != torch.long:
175 | encoded_text = encoded_text.to(torch.long)
176 |
177 | # Compute text embeddings
178 | if self.direct_bagging:
179 | x_text = self.embeddings(encoded_text) # (batch_size, embedding_dim)
180 | else:
181 | # Compute embeddings and averaging in a memory-efficient way
182 | x_text = self.embeddings(encoded_text) # (batch_size, seq_len, embedding_dim)
183 | # Calculate non-zero tokens mask once
184 | non_zero_mask = (x_text.sum(-1) != 0).float() # (batch_size, seq_len)
185 | token_counts = non_zero_mask.sum(-1, keepdim=True) # (batch_size, 1)
186 |
187 | # Sum and average in place
188 | x_text = (x_text * non_zero_mask.unsqueeze(-1)).sum(
189 | dim=1
190 | ) # (batch_size, embedding_dim)
191 | x_text = torch.div(x_text, token_counts.clamp(min=1.0))
192 | x_text = torch.nan_to_num(x_text, 0.0)
193 |
194 | # Handle categorical variables efficiently
195 | if not self.no_cat_var and additional_inputs.numel() > 0:
196 | cat_embeds = []
197 | # Process categorical embeddings in batch
198 | for i, (_, embed_layer) in enumerate(self.categorical_embedding_layers.items()):
199 | cat_input = additional_inputs[:, i].long()
200 | cat_embed = embed_layer(cat_input)
201 | if cat_embed.dim() > 2:
202 | cat_embed = cat_embed.squeeze(1)
203 | cat_embeds.append(cat_embed)
204 |
205 | if cat_embeds: # If we have categorical embeddings
206 | if self.categorical_embedding_dims is not None:
207 | if self.average_cat_embed:
208 | # Stack and average in one operation
209 | x_cat = torch.stack(cat_embeds, dim=0).mean(dim=0)
210 | x_combined = torch.cat([x_text, x_cat], dim=1)
211 | else:
212 | # Optimize concatenation
213 | x_combined = torch.cat([x_text] + cat_embeds, dim=1)
214 | else:
215 | # Sum embeddings efficiently
216 | x_combined = x_text + torch.stack(cat_embeds, dim=0).sum(dim=0)
217 | else:
218 | x_combined = x_text
219 | else:
220 | x_combined = x_text
221 |
222 | # Final linear layer
223 | return self.fc(x_combined)
224 |
225 | def predict(
226 | self,
227 | text: List[str],
228 | categorical_variables: List[List[int]],
229 | top_k=1,
230 | explain=False,
231 | preprocess=True,
232 | ):
233 | """
234 | Args:
235 | text (List[str]): A list of text observations.
236 |             categorical_variables (List[List[int]]): The categorical variables,
237 |                 one list of length len(text) per categorical feature.
238 | top_k (int): for each sentence, return the top_k most likely predictions (default: 1)
239 |             explain (bool): run integrated gradients to explain the prediction (default: False)
240 | preprocess (bool): If True, preprocess text. Needs unidecode library.
241 |
242 | Returns:
243 | if explain is False:
244 |                 predictions (torch.Tensor, shape (len(text), top_k)): A tensor containing the top_k most likely codes for each query.
245 |                 confidence (torch.Tensor, shape (len(text), top_k)): A tensor containing the corresponding confidence scores.
246 | if explain is True:
247 |                 predictions (torch.Tensor, shape (len(text), top_k)): Containing the top_k most likely codes for each query.
248 | confidence (torch.Tensor, shape (len(text), top_k)): Corresponding confidence scores.
249 | all_attributions (torch.Tensor, shape (len(text), top_k, seq_len)): A tensor containing the attributions for each token in the text.
250 | x (torch.Tensor): A tensor containing the token indices of the text.
251 | id_to_token_dicts (List[Dict[int, str]]): A list of dictionaries mapping token indices to tokens (one for each sentence).
252 | token_to_id_dicts (List[Dict[str, int]]): A list of dictionaries mapping tokens to token indices: the reverse of those in id_to_token_dicts.
253 |                 text (list[str]): A list containing the preprocessed text (one line for each sentence).
254 | """
255 |
256 | flag_change_embed = False
257 | if explain:
258 | if not HAS_CAPTUM:
259 | raise ImportError(
260 | "Captum is not installed and is required for explainability. Run 'pip install torchFastText[explainability]'."
261 | )
262 | if self.direct_bagging:
263 | # Get back the classical embedding layer for explainability
264 | new_embed_layer = nn.Embedding(
265 | embedding_dim=self.embedding_dim,
266 | num_embeddings=self.num_rows,
267 | padding_idx=self.padding_idx,
268 | sparse=self.sparse,
269 | )
270 | new_embed_layer.load_state_dict(
271 | self.embeddings.state_dict()
272 | ) # No issues, as exactly the same parameters
273 | self.embeddings = new_embed_layer
274 | self.direct_bagging = (
275 | False # To inform the forward pass that we are not using EmbeddingBag anymore
276 | )
277 | flag_change_embed = True
278 |
279 | lig = LayerIntegratedGradients(
280 | self, self.embeddings
281 | ) # initialize a Captum layer gradient integrator
282 |
283 | self.eval()
284 | batch_size = len(text)
285 |
286 | indices_batch, id_to_token_dicts, token_to_id_dicts = self.tokenizer.tokenize(
287 | text, text_tokens=False, preprocess=preprocess
288 | )
289 |
290 | padding_index = (
291 | self.tokenizer.get_buckets() + self.tokenizer.get_nwords()
292 | ) # padding index, the integer value of the padding token
293 |
294 | padded_batch = torch.nn.utils.rnn.pad_sequence(
295 | indices_batch,
296 | batch_first=True,
297 | padding_value=padding_index,
298 | ) # (batch_size, seq_len) - Tokenized (int) + padded text
299 |
300 | x = padded_batch
301 |
302 | if not self.no_cat_var:
303 | other_features = []
304 | for i, categorical_variable in enumerate(categorical_variables):
305 | other_features.append(
306 | torch.tensor(categorical_variable).reshape(batch_size, -1).to(torch.int64)
307 | )
308 |
309 | other_features = torch.stack(other_features).reshape(batch_size, -1).long()
310 | else:
311 | other_features = torch.empty(batch_size)
312 |
313 | pred = self(
314 | x, other_features
315 | ) # forward pass, contains the prediction scores (len(text), num_classes)
316 | label_scores = pred.detach().cpu()
317 | label_scores_topk = torch.topk(label_scores, k=top_k, dim=1)
318 |
319 | predictions = label_scores_topk.indices # get the top_k most likely predictions
320 | confidence = torch.round(label_scores_topk.values, decimals=2) # and their scores
321 |
322 | if explain:
323 | assert not self.direct_bagging, "Direct bagging should be False for explainability"
324 | all_attributions = []
325 | for k in range(top_k):
326 | attributions = lig.attribute(
327 |                     (x, other_features), target=predictions[:, k].long()
328 | ) # (batch_size, seq_len)
329 | attributions = attributions.sum(dim=-1)
330 | all_attributions.append(attributions.detach().cpu())
331 |
332 | all_attributions = torch.stack(all_attributions, dim=1) # (batch_size, top_k, seq_len)
333 |
334 | # Get back to initial embedding layer:
335 | # EmbeddingBag -> Embedding -> EmbeddingBag
336 | # or keep Embedding with no change
337 | if flag_change_embed:
338 | new_embed_layer = nn.EmbeddingBag(
339 | embedding_dim=self.embedding_dim,
340 | num_embeddings=self.num_rows,
341 | padding_idx=self.padding_idx,
342 | sparse=self.sparse,
343 | )
344 | new_embed_layer.load_state_dict(
345 | self.embeddings.state_dict()
346 | ) # No issues, as exactly the same parameters
347 | self.embeddings = new_embed_layer
348 | self.direct_bagging = True
349 | return (
350 | predictions,
351 | confidence,
352 | all_attributions,
353 | x,
354 | id_to_token_dicts,
355 | token_to_id_dicts,
356 | text,
357 | )
358 | else:
359 | return predictions, confidence
360 |
361 | def predict_and_explain(self, text, categorical_variables, top_k=1, n=5, cutoff=0.65):
362 | """
363 | Args:
364 | text (List[str]): A list of sentences.
365 |             categorical_variables (List[List[int]]): The categorical variables,
366 |                 one list of length len(text) per categorical feature.
367 | top_k (int): for each sentence, return the top_k most likely predictions (default: 1)
368 | n (int): mapping processed to original words: max number of candidate processed words to consider per original word (default: 5)
369 |             cutoff (float): mapping processed to original words: minimum similarity score to consider a candidate processed word (default: 0.65)
370 |
371 | Returns:
372 |             predictions (torch.Tensor, shape (len(text), top_k)): Containing the top_k most likely codes for each query.
373 | confidence (torch.Tensor, shape (len(text), top_k)): Corresponding confidence scores.
374 | all_scores (List[List[List[float]]]): For each sentence, list of the top_k lists of attributions for each word in the sentence (one for each pred).
375 | """
376 |
377 | # Step 1: Get the predictions, confidence scores and attributions at token level
378 | (
379 | pred,
380 | confidence,
381 | all_attr,
382 | tokenized_text,
383 | id_to_token_dicts,
384 | token_to_id_dicts,
385 | processed_text,
386 | ) = self.predict(
387 | text=text, categorical_variables=categorical_variables, top_k=top_k, explain=True
388 | )
389 |
390 | tokenized_text_tokens = self.tokenizer._tokenized_text_in_tokens(
391 | tokenized_text, id_to_token_dicts
392 | )
393 |
394 | # Step 2: Map the attributions at token level to the processed words
395 | processed_word_to_score_dicts, processed_word_to_token_idx_dicts = (
396 | compute_preprocessed_word_score(
397 | processed_text,
398 | tokenized_text_tokens,
399 | all_attr,
400 | id_to_token_dicts,
401 | token_to_id_dicts,
402 | min_n=self.tokenizer.min_n,
403 | padding_index=self.padding_idx,
404 | end_of_string_index=0,
405 | )
406 | )
407 |
408 | # Step 3: Map the processed words to the original words
409 | all_scores, orig_to_processed_mappings = compute_word_score(
410 | processed_word_to_score_dicts, text, n=n, cutoff=cutoff
411 | )
412 |
413 | # Step 2bis: Get the attributions at letter level
414 | all_scores_letters = explain_continuous(
415 | text,
416 | processed_text,
417 | tokenized_text_tokens,
418 | orig_to_processed_mappings,
419 | processed_word_to_token_idx_dicts,
420 | all_attr,
421 | top_k,
422 | )
423 |
424 | return pred, confidence, all_scores, all_scores_letters
425 |
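A minimal sketch of instantiating FastTextModel directly and running a forward pass without categorical variables (all sizes below are arbitrary; in normal use the wrapper in torchFastText.py builds the model from a tokenizer):

import torch
from torchFastText.model.pytorch_model import FastTextModel

# num_rows must cover every token index plus the padding row
# (by the wrapper's convention, padding_idx = num_rows - 1).
model = FastTextModel(embedding_dim=32, num_classes=5, num_rows=1000, padding_idx=999, sparse=False)
tokens = torch.randint(0, 999, (2, 12))  # (batch_size, seq_len) token indices
scores = model(tokens, torch.empty(2))   # no categorical features: the second argument is ignored here
print(scores.shape)                      # torch.Size([2, 5])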
--------------------------------------------------------------------------------
/torchFastText/preprocess/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Init script.
3 | """
4 |
5 | from .preprocess import clean_text_feature as clean_text_feature
6 |
--------------------------------------------------------------------------------
/torchFastText/preprocess/preprocess.py:
--------------------------------------------------------------------------------
1 | """
2 | Processing fns.
3 | """
4 |
5 | import string
6 |
7 | import numpy as np
8 |
9 | try:
10 | import nltk
11 |     from nltk.corpus import stopwords as nltk_stopwords
12 | from nltk.stem.snowball import SnowballStemmer
13 |
14 | HAS_NLTK = True
15 | except ImportError:
16 | HAS_NLTK = False
17 |
18 | try:
19 | import unidecode
20 |
21 | HAS_UNIDECODE = True
22 | except ImportError:
23 | HAS_UNIDECODE = False
24 |
25 |
26 | def clean_text_feature(text: list[str], remove_stop_words=True):
27 | """
28 | Cleans a text feature.
29 |
30 | Args:
31 | text (list[str]): List of text descriptions.
32 | remove_stop_words (bool): If True, remove stopwords.
33 |
34 | Returns:
35 | list[str]: List of cleaned text descriptions.
36 |
37 | """
38 | if not HAS_NLTK:
39 | raise ImportError(
40 | "nltk is not installed and is required for preprocessing. Run 'pip install torchFastText[preprocess]'."
41 | )
42 | if not HAS_UNIDECODE:
43 | raise ImportError(
44 | "unidecode is not installed and is required for preprocessing. Run 'pip install torchFastText[preprocess]'."
45 | )
46 |
47 | # Define stopwords and stemmer
48 |
49 | nltk.download("stopwords", quiet=True)
50 |     stopwords = tuple(nltk_stopwords.words("french")) + tuple(string.ascii_lowercase)
51 | stemmer = SnowballStemmer(language="french")
52 |
53 |     # Remove accented characters
54 | text = np.vectorize(unidecode.unidecode)(np.array(text))
55 |
56 | # To lowercase
57 | text = np.char.lower(text)
58 |
59 |     # Remove one-letter words
60 |     def remove_one_letter_words(x):
61 |         return " ".join([w for w in x.split() if len(w) > 1])
62 |
63 |     text = np.vectorize(remove_one_letter_words)(text)
64 |
65 | # Remove duplicate words and stopwords in texts
66 | # Stem words
67 | libs_token = [lib.split() for lib in text.tolist()]
68 | libs_token = [
69 | sorted(set(libs_token[i]), key=libs_token[i].index) for i in range(len(libs_token))
70 | ]
71 | if remove_stop_words:
72 | text = [
73 | " ".join([stemmer.stem(word) for word in libs_token[i] if word not in stopwords])
74 | for i in range(len(libs_token))
75 | ]
76 | else:
77 | text = [
78 | " ".join([stemmer.stem(word) for word in libs_token[i]]) for i in range(len(libs_token))
79 | ]
80 |
81 |     # Return the cleaned text as a list of strings
82 | return text
83 |
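A minimal usage sketch of clean_text_feature (requires the nltk and unidecode extras; the exact stems depend on the Snowball stemmer version, so the output shown is indicative only):

from torchFastText.preprocess import clean_text_feature

print(clean_text_feature(["Boulangerie et pâtisserie artisanales"]))
# Roughly ['boulanger patisser artisanal']: accents stripped, text lowercased,
# the stopword 'et' removed, and the remaining words stemmed.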
--------------------------------------------------------------------------------
/torchFastText/torchFastText.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import time
3 | import json
4 | from typing import Optional, Union, Type, List
5 | from dataclasses import dataclass, field, asdict
6 |
7 | import numpy as np
8 | import pytorch_lightning as pl
9 | import torch
10 | from pytorch_lightning.callbacks import (
11 | EarlyStopping,
12 | LearningRateMonitor,
13 | ModelCheckpoint,
14 | )
15 | from torch.optim import SGD, Adam
16 |
17 | from .datasets.dataset import FastTextModelDataset
18 | from .datasets.tokenizer import NGramTokenizer
19 | from .model.pytorch_model import FastTextModel
20 | from .model.lightning_module import FastTextModule
21 | from .utilities.checkers import check_X, check_Y, NumpyJSONEncoder
22 |
23 | logger = logging.getLogger(__name__)
24 |
25 | logging.basicConfig(
26 | level=logging.INFO,
27 | format="%(asctime)s - %(name)s - %(message)s",
28 | datefmt="%Y-%m-%d %H:%M:%S",
29 | handlers=[logging.StreamHandler()],
30 | )
31 |
32 |
33 | @dataclass
34 | class torchFastText:
35 | """
36 | The main class for the torchFastText model.
37 |
38 | Args for init:
39 | Architecture-related:
40 | Text-embedding matrix:
41 | num_tokens (int): Number of rows in the embedding matrix (without counting the word-rows)
42 | embedding_dim (int): Dimension of the embedding matrix
43 | sparse (bool): Whether the embedding matrix is sparse
44 | Categorical variables, if any:
45 | categorical_vocabulary_sizes (List[int]): List of the number of unique values for each categorical variable.
46 | - Do not provide if no categorical variables
47 | - Do not provide if you want this to be inferred from X_train in the build method
48 | categorical_embedding_dims (Union[List[int], int]): List of the embedding dimensions for each categorical variable.
49 | - Do not provide if no categorical variables
50 |             - Do not provide if there are categorical variables but their embeddings should be summed to the sentence embedding
51 |             - Provide an int if all embeddings should have the same dimension and be averaged before concatenation to the sentence embedding
52 | - Provide a list of int if each embedding should be of a different dimension, concatenated without aggregation to the sentence embedding
53 | num_categorical_features (int): Number of categorical variables.
54 |             - Not required; should match the length of the above-mentioned lists
55 |             - Especially useful when vocabulary sizes are not provided and categorical_embedding_dims is an int
56 |
57 | Tokenizer-related:
58 | min_count (int): Minimum number of times a word has to be in the training data to be given an embedding.
59 | min_n (int): Minimum length of character n-grams.
60 | max_n (int): Maximum length of character n-grams.
61 | len_word_ngrams (int): Maximum length of word n-grams.
62 |
63 | Other attributes, not exposed during initialization:
64 | tokenizer (NGramTokenizer): Tokenizer object, see build_from_tokenizer
65 |
66 |
67 | """
68 |
69 | # Required parameters
70 |
71 | # Embedding matrix
72 | embedding_dim: int
73 | sparse: bool
74 |
75 | # Tokenizer-related
76 | num_tokens: int
77 | min_count: int
78 | min_n: int
79 | max_n: int
80 | len_word_ngrams: int
81 |
82 | # Optional parameters with default values
83 | num_classes: Optional[int] = None
84 | num_rows: Optional[int] = (
85 | None # Default = num_tokens + tokenizer.get_nwords() + 1, but can be customized
86 | )
87 |
88 | # Embedding matrices of categorical variables
89 | categorical_vocabulary_sizes: Optional[List[int]] = None
90 | categorical_embedding_dims: Optional[Union[List[int], int]] = None
91 | num_categorical_features: Optional[int] = None
92 |
93 | # Internal fields (not exposed during initialization)
94 | tokenizer: Optional[NGramTokenizer] = field(init=True, default=None)
95 | pytorch_model: Optional[FastTextModel] = field(init=False, default=None)
96 | lightning_module: Optional[FastTextModule] = field(init=True, default=None)
97 | trained: bool = field(init=False, default=False)
98 |
99 | direct_bagging: Optional[bool] = True # Use nn.EmbeddingBag instead of nn.Embedding
100 |
101 | def _build_pytorch_model(self):
102 | if self.num_rows is None:
103 | if self.tokenizer is None:
104 | raise ValueError(
105 | "Please provide a tokenizer (for instance using model.build_tokenizer()) or num_rows."
106 | )
107 |
108 | else:
109 | self.num_rows = self.tokenizer.padding_index + 1
110 |
111 | else:
112 | if self.tokenizer is not None:
113 | if self.num_rows != self.tokenizer.padding_index + 1:
114 | logger.warning(
115 | f"""Divergent values for num_rows: {self.num_rows} and {self.tokenizer.padding_index + 1} (tokenizer's padding index).
116 | It is set to the max. The padding index will be updated (Always set to num_rows - 1)."""
117 | )
118 | self.num_rows = max(self.num_rows, self.tokenizer.padding_index + 1)
119 |
120 | self.padding_idx = self.num_rows - 1
121 |
122 | # If necessary, update the padding index in the tokenizer so that both match
123 | # Needs to be up to date for tokenizing at inference time (tokenizer.tokenize())
124 | if self.tokenizer is not None and self.padding_idx != self.tokenizer.padding_index:
125 | self.tokenizer.padding_index = self.padding_idx
126 |
127 | self.pytorch_model = FastTextModel(
128 | tokenizer=self.tokenizer,
129 | embedding_dim=self.embedding_dim,
130 | num_rows=self.num_rows,
131 | num_classes=self.num_classes,
132 | categorical_vocabulary_sizes=self.categorical_vocabulary_sizes,
133 | categorical_embedding_dims=self.categorical_embedding_dims,
134 | padding_idx=self.padding_idx,
135 | sparse=self.sparse,
136 | direct_bagging=self.direct_bagging,
137 | )
138 |
139 | def _check_and_init_lightning(
140 | self,
141 | optimizer=None,
142 | optimizer_params=None,
143 | lr=None,
144 | scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau,
145 | scheduler_params=None,
146 | patience_scheduler=3,
147 | loss=torch.nn.CrossEntropyLoss(),
148 | ):
149 | if optimizer is None:
150 | if lr is None:
151 | raise ValueError("Please provide a learning rate")
152 | self.optimizer = SGD if self.sparse else Adam
153 | self.optimizer_params = {"lr": lr}
154 | else:
155 | self.optimizer = optimizer
156 | if optimizer_params is None:
157 | if lr is not None:
158 | self.optimizer_params = {"lr": lr}
159 | else:
160 | logger.warning(
161 |                         "Neither optimizer parameters nor a learning rate provided. Using default parameters."
162 | )
163 | self.optimizer_params = {}
164 |
165 | self.scheduler = scheduler
166 |
167 | if scheduler_params is None:
168 | logger.warning(
169 | "No scheduler parameters provided. Using default parameters (suited for ReduceLROnPlateau)."
170 | )
171 | self.scheduler_params = {
172 | "mode": "min",
173 | "patience": patience_scheduler,
174 | }
175 | else:
176 | self.scheduler_params = scheduler_params
177 |
178 | self.loss = loss
179 |
180 | self.lightning_module = FastTextModule(
181 | model=self.pytorch_model,
182 | loss=self.loss,
183 | optimizer=self.optimizer,
184 | optimizer_params=self.optimizer_params,
185 | scheduler=self.scheduler,
186 | scheduler_params=self.scheduler_params,
187 | scheduler_interval="epoch",
188 | )
189 |
190 | @classmethod
191 | def build_from_tokenizer(
192 | cls: Type["torchFastText"],
193 | tokenizer: NGramTokenizer,
194 | embedding_dim: int,
195 | num_classes: Optional[int],
196 | categorical_vocabulary_sizes: Optional[List[int]],
197 | sparse: bool = False,
198 | categorical_embedding_dims: Optional[Union[List[int], int]] = None,
199 | num_categorical_features: Optional[int] = None,
200 | lightning=True,
201 | optimizer=None,
202 | optimizer_params: Optional[dict] = None,
203 | lr=None,
204 | scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau,
205 | scheduler_params: Optional[dict] = None,
206 | patience_scheduler=3,
207 | loss=torch.nn.CrossEntropyLoss(),
208 | ) -> "torchFastText":
209 | """
210 | Alternative constructor that initializes torchFastText from a tokenizer.
211 | Directly builds the PyTorch model and Lightning module (if lightning == True).
212 |
213 | Args:
214 | tokenizer: A NGramTokenizer object that provides min_n, max_n, and other variables.
215 |             Refer to NGramTokenizer, FastTextModule and the constructor above for the other variables.
216 |
217 | Returns:
218 | torchFastText: An instance of torchFastText initialized using the tokenizer.
219 | """
220 | # Ensure the tokenizer has the required attributes
221 | if not all(
222 | hasattr(tokenizer, attr)
223 | for attr in ["min_count", "min_n", "max_n", "num_tokens", "word_ngrams"]
224 | ):
225 |             raise ValueError(f"Required attribute missing in tokenizer: {tokenizer}")
226 |
227 | # Extract attributes from the tokenizer
228 | min_count = tokenizer.min_count
229 | min_n = tokenizer.min_n
230 | max_n = tokenizer.max_n
231 | num_tokens = tokenizer.num_tokens
232 | len_word_ngrams = tokenizer.word_ngrams
233 |
234 | wrapper = cls(
235 | num_tokens=num_tokens,
236 | embedding_dim=embedding_dim,
237 | min_count=min_count,
238 | min_n=min_n,
239 | max_n=max_n,
240 | len_word_ngrams=len_word_ngrams,
241 | sparse=sparse,
242 | num_classes=num_classes,
243 | categorical_vocabulary_sizes=categorical_vocabulary_sizes,
244 | categorical_embedding_dims=categorical_embedding_dims,
245 | num_categorical_features=num_categorical_features,
246 | tokenizer=tokenizer,
247 | )
248 |
249 | wrapper._build_pytorch_model()
250 |
251 | if lightning:
252 | wrapper._check_and_init_lightning(
253 | optimizer=optimizer,
254 | optimizer_params=optimizer_params,
255 | lr=lr,
256 | scheduler=scheduler,
257 | scheduler_params=scheduler_params,
258 | patience_scheduler=patience_scheduler,
259 | loss=loss,
260 | )
261 | return wrapper
262 |
263 | def build_tokenizer(self, training_text):
264 | self.tokenizer = NGramTokenizer(
265 | self.min_count,
266 | self.min_n,
267 | self.max_n,
268 | self.num_tokens,
269 | self.len_word_ngrams,
270 | training_text,
271 | )
272 |
273 | def build(
274 | self,
275 | X_train: np.ndarray,
276 | y_train: np.ndarray = None,
277 | lightning=True,
278 | optimizer=None,
279 | optimizer_params=None,
280 | lr=None,
281 | scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau,
282 | scheduler_params=None,
283 | patience_scheduler=3,
284 | loss=torch.nn.CrossEntropyLoss(),
285 | ):
286 | """
287 | Public method that acts as a wrapper to build model, inferring from training data if necessary.
288 |
289 | Args:
290 | X_train (np.ndarray): Array of shape (N,d) with the first column being the text and the rest being the categorical variables.
291 | y_train (np.ndarray): Array of shape (N,) with the labels. Optional if num_classes is provided at initialization (overwrites it otherwise).
292 | lightning (bool): Whether to build the Lightning module. Default is True.
293 |             optimizer: Optimizer to use. If None, "lr" must be provided; SGD is used if self.sparse is True, Adam otherwise.
294 | optimizer_params: Dictionary containing optimizer parameters. If None, "lr" will be used as the learning rate.
295 | lr (float): Learning rate. Required if optimizer is None.
296 | scheduler: Scheduler to use. Default is ReduceLROnPlateau.
297 | scheduler_params: Dictionary containing scheduler parameters. Default is {"mode": "min", "patience": patience_scheduler}, well-suited for ReduceLROnPlateau.
298 | patience_scheduler (int): Patience for the scheduler. Default is 3.
299 | loss: Loss function to use. Default is CrossEntropyLoss.
300 | """
301 | training_text, categorical_variables, no_cat_var = check_X(X_train)
302 |
303 | if y_train is not None:
304 | if self.num_classes is not None:
305 | if self.num_classes != len(np.unique(y_train)):
306 | logger.warning(
307 | f"Old num_classes value is {self.num_classes}. New num_classes value is {len(np.unique(y_train))}."
308 | )
309 |
310 | y_train = check_Y(y_train)
311 | self.num_classes = len(
312 | np.unique(y_train)
313 |         )  # Make sure that y_train contains all the classes!
314 |
315 | if np.max(y_train) >= self.num_classes:
316 | raise ValueError(
317 | f"y_train must contain values between 0 and {self.num_classes - 1}. Make sure that np.max(y_train) == len(np.unique(y_train))-1."
318 | )
319 |
320 | else:
321 | if self.num_classes is None:
322 | raise ValueError(
323 | "Either num_classes must be provided at init or y_train must be provided here."
324 | )
325 |
326 | if not no_cat_var:
327 | if self.num_categorical_features is not None:
328 | if self.num_categorical_features != categorical_variables.shape[1]:
329 | logger.warning(
330 | f"num_categorical_features: old value is {self.num_categorical_features}. New value is {categorical_variables.shape[1]}."
331 | )
332 |
333 | self.num_categorical_features = categorical_variables.shape[1]
334 |
335 | categorical_vocabulary_sizes = np.max(categorical_variables, axis=0) + 1
336 |
337 | if self.categorical_vocabulary_sizes is not None:
338 | if self.categorical_vocabulary_sizes != list(categorical_vocabulary_sizes):
339 | logger.warning(
340 | "categorical_vocabulary_sizes was provided at initialization. It will be overwritten by the unique values in the training data."
341 | )
342 |             self.categorical_vocabulary_sizes = list(categorical_vocabulary_sizes)  # always use the values inferred from X_train
343 | else:
344 | if self.categorical_vocabulary_sizes is not None:
345 | logger.warning(
346 | "categorical_vocabulary_sizes was provided at initialization but no categorical variables are provided in X_train. Updating to None."
347 | )
348 | self.categorical_vocabulary_sizes = None
349 | if self.num_categorical_features is not None:
350 | logger.warning(
351 | "num_categorical_features was provided at initialization but no categorical variables are provided in X_train. Updating to None."
352 | )
353 | self.num_categorical_features = None
354 |
355 | self.build_tokenizer(training_text)
356 | self._build_pytorch_model()
357 |
358 | if lightning:
359 | self._check_and_init_lightning(
360 | optimizer=optimizer,
361 | optimizer_params=optimizer_params,
362 | lr=lr,
363 | scheduler=scheduler,
364 | scheduler_params=scheduler_params,
365 | patience_scheduler=patience_scheduler,
366 | loss=loss,
367 | )
368 |
369 | def build_data_loaders(self, X_train, y_train, X_val, y_val, batch_size, num_workers):
370 | """
371 | A public method to build the dataloaders, with few arguments and running checks.
372 |
373 | Args:
374 | X_train (np.ndarray): Array of shape (N,d) with the first column being the text and the rest being the categorical variables.
375 | y_train (np.ndarray): Array of shape (N,) with the labels.
376 | X_val (np.ndarray): Array of shape (N,d) with the first column being the text and the rest being the categorical variables.
377 | y_val (np.ndarray): Array of shape (N,) with the labels.
378 | batch_size (int): Batch size.
379 | num_workers (int): Number of workers for the dataloaders.
380 |
381 | Returns:
382 | Tuple[torch.utils.data.DataLoader]: Training and validation dataloaders.
383 |
384 | """
385 |
386 | training_text, train_categorical_variables, train_no_cat_var = check_X(X_train)
387 | val_text, val_categorical_variables, val_no_cat_var = check_X(X_val)
388 | y_train = check_Y(y_train)
389 | y_val = check_Y(y_val)
390 |
391 | # Datasets and dataloaders
392 | train_dataset = FastTextModelDataset(
393 | categorical_variables=train_categorical_variables,
394 | texts=training_text,
395 | outputs=y_train,
396 | tokenizer=self.tokenizer,
397 | )
398 | val_dataset = FastTextModelDataset(
399 | categorical_variables=val_categorical_variables,
400 | texts=val_text,
401 | outputs=y_val,
402 | tokenizer=self.tokenizer,
403 | )
404 |
405 | train_dataloader = train_dataset.create_dataloader(
406 | batch_size=batch_size, num_workers=num_workers
407 | )
408 | val_dataloader = val_dataset.create_dataloader(
409 | batch_size=batch_size, num_workers=num_workers
410 | )
411 |
412 | return train_dataloader, val_dataloader
413 |
414 | def __build_data_loaders(
415 | self,
416 | train_categorical_variables,
417 | training_text,
418 | y_train,
419 | val_categorical_variables,
420 | val_text,
421 | y_val,
422 | batch_size,
423 | num_workers,
424 | ):
425 | """
426 | A private method to build the dataloaders, without running checks.
427 | Used in train method (where checks are run beforehand).
428 |
429 | Args:
430 | train_categorical_variables (np.ndarray): Array of shape (N_train,d-1) with the categorical variables.
431 | training_text (np.ndarray): Array of shape (N_train,) with the text in string format
432 | y_train (np.ndarray): Array of shape (N_train,) with the labels.
433 | val_categorical_variables (np.ndarray): Array of shape (N_val,d-1) with the categorical variables.
434 | val_text (np.ndarray): Array of shape (N_val,) with the text in string format
435 | y_val (np.ndarray): Array of shape (N_val,) with the labels.
436 | batch_size (int): Batch size.
437 | num_workers (int): Number of workers for the dataloaders.
438 |
439 | Returns:
440 | Tuple[torch.utils.data.DataLoader]: Training and validation dataloaders.
441 | """
442 |
443 | # Datasets and dataloaders
444 | train_dataset = FastTextModelDataset(
445 | categorical_variables=train_categorical_variables,
446 | texts=training_text,
447 | outputs=y_train,
448 | tokenizer=self.tokenizer,
449 | )
450 | val_dataset = FastTextModelDataset(
451 | categorical_variables=val_categorical_variables,
452 | texts=val_text,
453 | outputs=y_val,
454 | tokenizer=self.tokenizer,
455 | )
456 |
457 | train_dataloader = train_dataset.create_dataloader(
458 | batch_size=batch_size, num_workers=num_workers
459 | )
460 | val_dataloader = val_dataset.create_dataloader(
461 | batch_size=batch_size, num_workers=num_workers
462 | )
463 |
464 | return train_dataloader, val_dataloader
465 |
466 | def train(
467 | self,
468 | X_train: np.ndarray,
469 | y_train: np.ndarray,
470 | X_val: np.ndarray,
471 | y_val: np.ndarray,
472 | num_epochs: int,
473 | batch_size: int,
474 | cpu_run: bool = False,
475 | num_workers: int = 12,
476 | optimizer=None,
477 | optimizer_params=None,
478 | lr: float = None,
479 | scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau,
480 | patience_scheduler: int = 3,
481 | loss=torch.nn.CrossEntropyLoss(),
482 | patience_train=3,
483 | verbose: bool = False,
484 | trainer_params: Optional[dict] = None,
485 | ):
486 | """
487 | Trains the FastText model using the provided training and validation data.
488 | This method checks the provided inputs for consistency and correctness, builds the model if necessary,
489 | creates data loaders for the training and validation datasets, and initiates the training process using a PyTorch Lightning trainer.
490 | After training, it loads the best model checkpoint, moves the model to CPU, and sets the model to evaluation mode.
491 | Parameters:
492 | X_train (np.ndarray): Array containing the training data features.
493 | y_train (np.ndarray): Array containing the training data labels.
494 | X_val (np.ndarray): Array containing the validation data features.
495 | y_val (np.ndarray): Array containing the validation data labels.
496 | num_epochs (int): The maximum number of epochs for the training process.
497 | batch_size (int): The size of the mini-batches used during training.
498 | cpu_run (bool, optional): Whether to force the training to run on the CPU. Defaults to False.
499 | num_workers (int, optional): Number of worker threads for data loading. Defaults to 12.
500 | optimizer (optional): A PyTorch optimizer to use for training. Must be from torch.optim if provided.
501 | optimizer_params (optional): Additional parameters for configuring the optimizer.
502 | lr (float, optional): The learning rate for the optimizer.
503 | scheduler (optional): A PyTorch learning rate scheduler. Defaults to torch.optim.lr_scheduler.ReduceLROnPlateau.
504 | patience_scheduler (int, optional): Number of epochs with no improvement after which learning rate will be reduced. Defaults to 3.
505 | loss (torch.nn.Module, optional): The loss function to optimize. Must be an instance of a PyTorch loss module. Defaults to torch.nn.CrossEntropyLoss().
506 | patience_train (int, optional): Number of epochs with no improvement for early stopping. Defaults to 3.
507 | verbose (bool, optional): Flag to enable verbose logging. Defaults to False.
508 | trainer_params (Optional[dict], optional): Additional parameters to be passed to the PyTorch Lightning Trainer.
509 | Returns:
510 | None
511 | Side Effects:
512 | - Builds the tokenizer and PyTorch model if not already built.
513 | - Creates training and validation data loaders.
514 | - Initiates training using a PyTorch Lightning Trainer.
515 | - Saves the best model checkpoint path in `self.best_model_path`.
516 | - Loads the best model and sets it to evaluation mode.
517 | - Sets `self.trained` to True upon successful training.
518 | Raises:
519 | AssertionError: If provided loss is not a PyTorch loss module, if optimizer is not from torch.optim,
520 | or if scheduler is not a PyTorch learning rate scheduler.
521 | AssertionError: If there is a mismatch in the number of observations or the number of columns between the training and validation datasets.
522 | """
523 |         ##### Input format checks #####
524 |
525 | assert isinstance(loss, torch.nn.Module), "loss must be a PyTorch loss function."
526 | assert optimizer is None or optimizer.__module__.startswith("torch.optim"), (
527 | "optimizer must be a PyTorch optimizer."
528 | )
529 | assert scheduler.__module__ == "torch.optim.lr_scheduler", (
530 | "scheduler must be a PyTorch scheduler."
531 | )
532 |
533 | # checking right format for inputs
534 | if verbose:
535 | logger.info("Checking inputs...")
536 |
537 | training_text, train_categorical_variables, train_no_cat_var = check_X(X_train)
538 | val_text, val_categorical_variables, val_no_cat_var = check_X(X_val)
539 | y_train = check_Y(y_train)
540 | y_val = check_Y(y_val)
541 |
542 | # some checks
543 | assert train_no_cat_var == val_no_cat_var, (
544 |             "X_train and X_val must both have, or both lack, categorical variables."
545 | )
546 | # shape
547 | assert X_train.shape[0] == y_train.shape[0], (
548 | "X_train and y_train must have the same first dimension (number of observations)."
549 | )
550 |         assert (X_train.ndim > 1 and X_train.shape[1] == X_val.shape[1]) or X_val.ndim == 1, (
551 | "X_train and X_val must have the same number of columns."
552 | )
553 |
554 | self.no_cat_var = train_no_cat_var
555 |
556 | if verbose:
557 | logger.info("Inputs successfully checked. Starting the training process..")
558 |
559 | ######## Starting the training process ########
560 |
561 | # Device
562 | if cpu_run:
563 | self.device = torch.device("cpu")
564 | else:
565 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
566 |
567 | if verbose:
568 | logger.info(f"Running on: {self.device}")
569 |
570 |         # Build tokenizer and PyTorch model (using training text and categorical variables)
571 | if self.tokenizer is None or self.pytorch_model is None:
572 | if verbose:
573 | start = time.time()
574 | logger.info("Building the model...")
575 | self.build(
576 | X_train,
577 | y_train,
578 | optimizer=optimizer,
579 | optimizer_params=optimizer_params,
580 | lr=lr,
581 | scheduler=scheduler,
582 | patience_scheduler=patience_scheduler,
583 | loss=loss.to(self.device),
584 | )
585 | if verbose:
586 | end = time.time()
587 | logger.info("Model successfully built in {:.2f} seconds.".format(end - start))
588 |
589 | self.pytorch_model = self.pytorch_model.to(self.device)
590 |
591 | # Dataloaders
592 | train_dataloader, val_dataloader = self.__build_data_loaders(
593 | train_categorical_variables=train_categorical_variables,
594 | training_text=training_text,
595 | y_train=y_train,
596 | val_categorical_variables=val_categorical_variables,
597 | val_text=val_text,
598 | y_val=y_val,
599 | batch_size=batch_size,
600 | num_workers=num_workers,
601 | )
602 |
603 | if verbose:
604 | logger.info("Lightning module successfully created.")
605 |
606 | # Trainer callbacks
607 | checkpoints = [
608 | {
609 | "monitor": "val_loss",
610 | "save_top_k": 1,
611 | "save_last": False,
612 | "mode": "min",
613 | }
614 | ]
615 | callbacks = [ModelCheckpoint(**checkpoint) for checkpoint in checkpoints]
616 | callbacks.append(
617 | EarlyStopping(
618 | monitor="val_loss",
619 | patience=patience_train,
620 | mode="min",
621 | )
622 | )
623 | callbacks.append(LearningRateMonitor(logging_interval="step"))
624 |
625 | # Strategy
626 | strategy = "auto"
627 |
628 | train_params = {
629 | "callbacks": callbacks,
630 | "max_epochs": num_epochs,
631 | "num_sanity_val_steps": 2,
632 | "strategy": strategy,
633 | "log_every_n_steps": 1,
634 | "enable_progress_bar": True,
635 | }
636 |
637 | if trainer_params is not None:
638 | train_params = train_params | trainer_params
639 |
640 | # Trainer
641 | self.trainer = pl.Trainer(**train_params)
642 |
643 | torch.cuda.empty_cache()
644 | torch.set_float32_matmul_precision("medium")
645 |
646 | if verbose:
647 | logger.info("Launching training...")
648 | start = time.time()
649 | self.trainer.fit(self.lightning_module, train_dataloader, val_dataloader)
650 | if verbose:
651 | end = time.time()
652 | logger.info("Training done in {:.2f} seconds.".format(end - start))
653 |
654 | # Load best model
655 | self.best_model_path = self.trainer.checkpoint_callback.best_model_path
656 | self.lightning_module = FastTextModule.load_from_checkpoint(
657 | self.best_model_path,
658 | model=self.pytorch_model,
659 | loss=self.loss,
660 | optimizer=self.optimizer,
661 | optimizer_params=self.optimizer_params,
662 | scheduler=self.scheduler,
663 | scheduler_params=self.scheduler_params,
664 | scheduler_interval="epoch",
665 | )
666 | self.pytorch_model = self.lightning_module.model.to("cpu")
667 | self.trained = True
668 | self.pytorch_model.eval()
669 |
670 | def load_from_checkpoint(self, path):
671 | self.lightning_module = FastTextModule.load_from_checkpoint(
672 | path,
673 | model=self.pytorch_model,
674 | loss=self.loss,
675 | optimizer=self.optimizer,
676 | optimizer_params=self.optimizer_params,
677 | scheduler=self.scheduler,
678 | scheduler_params=self.scheduler_params,
679 | scheduler_interval="epoch",
680 | )
681 | self.pytorch_model = self.lightning_module.model
682 | self.tokenizer = self.pytorch_model.tokenizer
683 |
684 | self.sparse = self.pytorch_model.sparse
685 | self.num_tokens = self.tokenizer.num_tokens
686 | self.embedding_dim = self.pytorch_model.embedding_dim
687 | self.num_classes = self.pytorch_model.num_classes
688 | self.min_n = self.tokenizer.min_n
689 | self.max_n = self.tokenizer.max_n
690 | self.len_word_ngrams = self.tokenizer.word_ngrams
691 | self.no_cat_var = self.pytorch_model.no_cat_var
692 |
693 | def validate(self, X, Y, batch_size=256, num_workers=12):
694 | """
695 | Validates the model on the given data.
696 |
697 | Args:
698 | X (np.ndarray): Array of shape (N,d) with the first column being the text and the rest being the categorical variables.
699 | Y (np.ndarray): Array of shape (N,) with the labels.
700 |
701 | Returns:
702 |             List[Dict[str, float]]: The metrics returned by the Lightning test loop.
703 | """
704 |
705 | if not self.trained:
706 | raise Exception("Model must be trained first.")
707 |
708 | # checking right format for inputs
709 | text, categorical_variables, no_cat_var = check_X(X)
710 | y = check_Y(Y)
711 |
712 | if categorical_variables is not None:
713 | if categorical_variables.shape[1] != self.num_categorical_features:
714 | raise Exception(
715 | f"X must have the same number of categorical variables as the training data ({self.num_categorical_features})."
716 | )
717 | else:
718 |             assert self.pytorch_model.no_cat_var
719 |
720 |         self.pytorch_model.to(self.device)  # X is a numpy array, so it has no .device attribute
721 |
722 | dataset = FastTextModelDataset(
723 | categorical_variables=categorical_variables,
724 | texts=text,
725 | outputs=y,
726 | tokenizer=self.tokenizer,
727 | )
728 | dataloader = dataset.create_dataloader(batch_size=batch_size, num_workers=num_workers)
729 |
730 |         return self.trainer.test(self.lightning_module, dataloaders=dataloader, verbose=False)
731 |
732 | def predict(self, X, top_k=1, preprocess=False, verbose=False):
733 | """
734 | Predicts the "top_k" classes of the input text.
735 |
736 | Args:
737 | X (np.ndarray): Array of shape (N,d) with the first column being the text and the rest being the categorical variables.
738 | top_k (int): Number of classes to predict (by order of confidence).
739 | preprocess (bool): Whether to preprocess the text before predicting.
740 |
741 | Returns:
742 |             Tuple[torch.Tensor, torch.Tensor]: Predicted classes and confidence scores, each of shape (N, top_k).
743 | """
744 |
745 | if verbose:
746 | logger.info(
747 |                 "Preprocessing is set to True. Input text will be preprocessed, requiring the NLTK and Unidecode libraries."
748 | if preprocess
749 | else "Preprocessing is set to False. Input text will not be preprocessed and fed as is to the model."
750 | )
751 |
752 | if not self.trained:
753 | raise Exception("Model must be trained first.")
754 |
755 | # checking right format for inputs
756 | text, categorical_variables, no_cat_var = check_X(X)
757 | if categorical_variables is not None:
758 | if categorical_variables.shape[1] != self.num_categorical_features:
759 | raise Exception(
760 | f"X must have the same number of categorical variables as the training data ({self.num_categorical_features})."
761 | )
762 | else:
763 |             assert self.pytorch_model.no_cat_var
764 |
765 | return self.pytorch_model.predict(
766 | text, categorical_variables, top_k=top_k, preprocess=preprocess
767 | )
768 |
769 | def predict_and_explain(self, X, top_k=1):
770 | if not self.trained:
771 | raise Exception("Model must be trained first.")
772 |
773 | # checking right format for inputs
774 | text, categorical_variables, no_cat_var = check_X(X)
775 | if categorical_variables is not None:
776 | if categorical_variables.shape[1] != self.num_categorical_features:
777 | raise Exception(
778 | f"X must have the same number of categorical variables as the training data ({self.num_categorical_features})."
779 | )
780 | else:
781 |             assert self.pytorch_model.no_cat_var
782 |
783 | return self.pytorch_model.predict_and_explain(text, categorical_variables, top_k=top_k)
784 |
785 | def to_json(self, filepath: str) -> None:
786 | with open(filepath, "w") as f:
787 | data = asdict(self)
788 |
789 | # Exclude non-serializable fields
790 | data.pop("tokenizer", None)
791 | data.pop("pytorch_model", None)
792 | data.pop("lightning_module", None)
793 |
794 | data.pop("trained", None) # Useless to save
795 |
796 | json.dump(data, f, cls=NumpyJSONEncoder, indent=4)
797 |
798 | @classmethod
799 | def from_json(cls: Type["torchFastText"], filepath: str) -> "torchFastText":
800 | """
801 | Load a dataclass instance from a JSON file.
802 | """
803 | with open(filepath, "r") as f:
804 | data = json.load(f)
805 | return cls(**data)
806 |
807 |     def quantize(self):
808 | # TODO
809 | pass
810 |
811 |     def dequantize(self):
812 | # TODO
813 | pass
814 |
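An end-to-end sketch of the wrapper on toy data (this assumes the class is re-exported by the package's top-level __init__.py; otherwise import it from torchFastText.torchFastText). Validation is done on the training set here purely to keep the example short:

import numpy as np
from torchFastText import torchFastText

model = torchFastText(
    embedding_dim=32, sparse=False, num_tokens=10_000,
    min_count=1, min_n=3, max_n=6, len_word_ngrams=2,
)
X = np.array(["fromagerie artisanale", "garage automobile",
              "vente de pain", "reparation de voitures"])
y = np.array([0, 1, 0, 1])
model.train(X, y, X, y, num_epochs=1, batch_size=2, lr=1e-2,
            num_workers=0, cpu_run=True)  # num_workers=0 avoids multiprocessing in small scripts
predictions, confidence = model.predict(X[:2], top_k=1)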
--------------------------------------------------------------------------------
/torchFastText/utilities/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Init script.
3 | """
4 |
--------------------------------------------------------------------------------
/torchFastText/utilities/checkers.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import json
3 | from typing import Optional, Union, Type, List
4 |
5 | import numpy as np
6 |
7 | logger = logging.getLogger(__name__)
8 |
9 |
10 | def check_X(X):
11 | assert isinstance(X, np.ndarray), (
12 | "X must be a numpy array of shape (N,d), with the first column being the text and the rest being the categorical variables."
13 | )
14 |
15 | try:
16 | if X.ndim > 1:
17 | text = X[:, 0].astype(str)
18 | else:
19 | text = X[:].astype(str)
20 | except ValueError:
21 |         raise ValueError("The first column of X must be castable to string format.")
22 |
23 | if len(X.shape) == 1 or (len(X.shape) == 2 and X.shape[1] == 1):
24 | no_cat_var = True
25 | else:
26 | no_cat_var = False
27 |
28 | if not no_cat_var:
29 | try:
30 | categorical_variables = X[:, 1:].astype(int)
31 | except ValueError:
32 |             raise ValueError(
33 |                 f"Columns 1 to {X.shape[1] - 1} of X must be castable to integer format."
34 |             )
35 | else:
36 | categorical_variables = None
37 |
38 | return text, categorical_variables, no_cat_var
39 |
40 |
41 | def check_Y(Y):
42 | assert isinstance(Y, np.ndarray), "Y must be a numpy array of shape (N,) or (N,1)."
43 | assert len(Y.shape) == 1 or (len(Y.shape) == 2 and Y.shape[1] == 1), (
44 | "Y must be a numpy array of shape (N,) or (N,1)."
45 | )
46 |
47 | try:
48 | Y = Y.astype(int)
49 | except ValueError:
50 |         raise ValueError("Y must be castable to integer format.")
51 |
52 | return Y
53 |
54 |
55 | def validate_categorical_inputs(
56 | categorical_vocabulary_sizes: List[int],
57 | categorical_embedding_dims: Union[List[int], int],
58 | num_categorical_features: int = None,
59 | ):
60 | if categorical_vocabulary_sizes is None:
61 | logger.warning("No categorical_vocabulary_sizes. It will be inferred later.")
62 | return None, None, None
63 |
64 | else:
65 | if not isinstance(categorical_vocabulary_sizes, list):
66 | raise TypeError("categorical_vocabulary_sizes must be a list of int")
67 |
68 | if isinstance(categorical_embedding_dims, list):
69 | if len(categorical_vocabulary_sizes) != len(categorical_embedding_dims):
70 | raise ValueError(
71 | "Categorical vocabulary sizes and their embedding dimensions must have the same length"
72 | )
73 |
74 | if num_categorical_features is not None:
75 | if len(categorical_vocabulary_sizes) != num_categorical_features:
76 | raise ValueError(
77 | "len(categorical_vocabulary_sizes) must be equal to num_categorical_features"
78 | )
79 | else:
80 | num_categorical_features = len(categorical_vocabulary_sizes)
81 |
82 | assert num_categorical_features is not None, (
83 | "num_categorical_features should be inferred at this point."
84 | )
85 |
86 |     # Normalize embedding dims into a list of int (or leave as None)
87 | if categorical_embedding_dims is not None:
88 | if isinstance(categorical_embedding_dims, int):
89 | categorical_embedding_dims = [categorical_embedding_dims] * num_categorical_features
90 | elif not isinstance(categorical_embedding_dims, list):
91 | raise TypeError("categorical_embedding_dims must be an int or a list of int")
92 |
93 | assert isinstance(categorical_embedding_dims, list) or categorical_embedding_dims is None, (
94 | "categorical_embedding_dims must be a list of int at this point"
95 | )
96 |
97 | return categorical_vocabulary_sizes, categorical_embedding_dims, num_categorical_features
98 |
99 |
100 | class NumpyJSONEncoder(json.JSONEncoder):
101 | def default(self, obj):
102 | if isinstance(obj, np.integer):
103 | return int(obj)
104 | if isinstance(obj, np.floating):
105 | return float(obj)
106 | if isinstance(obj, np.ndarray):
107 | return obj.tolist()
108 | return super().default(obj)
109 |
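A small sketch of the checkers' contracts (the toy arrays are illustrative):

import numpy as np
from torchFastText.utilities.checkers import check_X, check_Y, validate_categorical_inputs

X = np.array([["boulangerie", 0, 2], ["garage", 1, 0]], dtype=object)
text, cats, no_cat = check_X(X)  # text: 2 strings, cats: (2, 2) int array, no_cat: False
y = check_Y(np.array([0, 1]))    # cast to int, shape (2,)

# An int embedding dimension is broadcast to one entry per categorical variable.
sizes, dims, n = validate_categorical_inputs([2, 3], 8)
print(sizes, dims, n)            # [2, 3] [8, 8] 2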
--------------------------------------------------------------------------------
/torchFastText/utilities/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Utility functions.
3 | """
4 |
5 | import warnings
6 | import difflib
7 | from difflib import SequenceMatcher
8 |
9 | import torch
10 | import torch.nn.functional as F
11 |
12 | from ..preprocess import clean_text_feature
13 |
14 |
15 | def preprocess_token(token):
16 |     preprocessed_token = token.replace("</s>", "")
17 | preprocessed_token = preprocessed_token.replace("<", "")
18 | preprocessed_token = preprocessed_token.replace(">", "")
19 |
20 | preprocessed_token = preprocessed_token.split()
21 |
22 | return preprocessed_token
23 |
24 |
25 | def map_processed_to_original(processed_words, original_words, n=1, cutoff=0.9):
26 | """
27 | Map processed words to original words based on similarity scores.
28 |
29 | Args:
30 | processed_words (List[str]): List of processed words.
31 | original_words (List[str]): List of original words.
32 |         n (int): Number of closest processed words to consider for a given original word (currently unused; only the single best match is kept).
33 | cutoff (float): Minimum similarity score for a match.
34 |
35 | Returns:
36 | Dict[str, str]: Mapping from original word to the corresponding closest processed word.
37 | """
38 |
39 | # For each word in the original list, find the n closest matching processed words
40 | word_mapping = {}
41 |
42 | for original_word in original_words:
43 | original_word_prepro = clean_text_feature([original_word], remove_stop_words=False)[
44 | 0
45 | ] # Preprocess the original word
46 |
47 | if original_word_prepro == "":
48 | continue
49 |
50 | max_similarity_score = 0
51 | best_processed_word = None
52 | # Calculate the similarity score for each processed word with the current original word
53 | for processed_word in processed_words:
54 | similarity_score = difflib.SequenceMatcher(
55 | None, processed_word, original_word_prepro
56 | ).ratio() # Ratcliff-Obershelp algorithm
57 |
58 | # Only consider matches with similarity above the cutoff
59 | if similarity_score > max_similarity_score and similarity_score >= cutoff:
60 | max_similarity_score = similarity_score
61 | best_processed_word = processed_word
62 |
63 | if best_processed_word is not None:
64 |             # Map the original word to its single
65 |             # best-matching processed word
66 | word_mapping[original_word] = best_processed_word
67 |
68 | return word_mapping
69 |
70 |
71 | def test_end_of_word(all_processed_words, word, target_token, next_token, min_n):
72 | flag = False
73 | if target_token[-1] == ">":
74 | if next_token[0] == "<":
75 | if word in target_token:
76 | flag = True
77 | if word in next_token:
78 | flag = False
79 | if next_token[1] != word[0]:
80 | flag = True
81 | if len(next_token) == min_n:
82 | flag = True
83 | if next_token in all_processed_words:
84 | flag = True
85 |
86 | return flag
87 |
88 |
89 | def match_word_to_token_indexes(sentence, tokenized_sentence_tokens, min_n):
90 | """
91 | Match words to token indexes in a sentence.
92 |
93 | Args:
94 | sentence (str): Preprocessed sentence.
95 | tokenized_sentence_tokens (List[str]): List of tokenized sentence tokens.
96 |
97 | Returns:
98 | Dict[str, List[int]]: Mapping from word to list of token indexes.
99 |
100 | """
101 |
102 | pointer_token = 0
103 | res = {}
104 | processed_sentence = clean_text_feature([sentence], remove_stop_words=False)[0]
105 | processed_words = processed_sentence.split()
106 | # we know the tokens are in the right order
107 | for index_word, word in enumerate(processed_words):
108 | if word not in res:
109 | res[word] = []
110 |
111 | start = pointer_token
112 |
113 |         # advance until we reach the end of the current word
114 | while not test_end_of_word(
115 | processed_words,
116 | word,
117 | tokenized_sentence_tokens[pointer_token],
118 | tokenized_sentence_tokens[pointer_token + 1],
119 | min_n=min_n,
120 | ):
121 | pointer_token += 1
122 | if pointer_token == len(tokenized_sentence_tokens) - 1:
123 | warnings.warn("Error in the tokenization of the sentence")
124 |                 # workaround to avoid the error: each word is associated with a regular range of tokens
125 |                 chunk = len(tokenized_sentence_tokens) // len(processed_words)
126 | for idx, word in enumerate(processed_words):
127 | res[word] = range(
128 |                     idx * chunk, min((idx + 1) * chunk, len(tokenized_sentence_tokens))
129 | )
130 | return res
131 |
132 | pointer_token += 1
133 | end = pointer_token
134 |
135 | res[word] += list(range(start, end))
136 |
137 | # here we arrive at the end of the sentence
138 |     assert tokenized_sentence_tokens[pointer_token] == "</s>"
139 | end_of_string_position = pointer_token
140 |
141 | # starting word n_gram
142 | pointer_token += 1
143 | while pointer_token < len(tokenized_sentence_tokens):
144 | token = tokenized_sentence_tokens[pointer_token]
145 | for index_word, word in enumerate(processed_sentence.split()):
146 | # now, the condition of matching changes: we need to find the word in the token
147 | if word in token:
148 | res[word].append(pointer_token)
149 | pointer_token += 1
150 |
151 | assert pointer_token == len(tokenized_sentence_tokens)
152 |     assert set(sum([v for v in res.values()], [end_of_string_position])) == set(
153 |         range(len(tokenized_sentence_tokens))
154 |     ), "Unused tokens: {}".format(
155 |         set(range(len(tokenized_sentence_tokens)))
156 |         - set(sum([v for v in res.values()], [end_of_string_position]))
157 |     )  # verify that all tokens are used
158 |
159 | return res
160 |
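# Illustrative sketch (hypothetical indexes): for the preprocessed sentence
# "le chat", the token stream is expected to contain the character n-grams of
# each word in order, then an empty end-of-string token, then word-level
# n-grams, so the result could look like
#
#   {"le": [0, 1, 8], "chat": [2, 3, 4, 5, 9]}
#
# where the indexes after the end-of-string position come from word n-grams
# containing the word.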
161 |
162 | # At text level: aggregate token-level scores into word-level scores
163 | def compute_preprocessed_word_score(
164 | preprocessed_text,
165 | tokenized_text_tokens,
166 | scores,
167 | id_to_token_dicts,
168 | token_to_id_dicts,
169 | min_n,
170 | padding_index=2009603,
171 | end_of_string_index=0,
172 | ):
173 |     """
174 |     Compute preprocessed word scores based on token scores.
175 | 
176 |     Args:
177 |         preprocessed_text (List[str]): List of preprocessed sentences.
178 |         tokenized_text_tokens (List[List[str]]): For each sentence, list of tokens in string form.
179 |         scores (List[torch.Tensor]): For each sentence, tensor of token scores, shape (top_k, seq_len).
180 |         id_to_token_dicts (List[Dict[int, str]]): For each sentence, mapping from token ID to token in string form.
181 |         token_to_id_dicts (List[Dict[str, int]]): For each sentence, mapping from token (string) to token ID.
182 |         min_n (int): Minimum character n-gram length used by the tokenizer.
183 |         padding_index (int): Index of the padding token.
184 |         end_of_string_index (int): Index of the end-of-string token.
185 | 
186 |     Returns:
187 |         Tuple[List[List[Dict[str, float]]], List[Dict[str, List[int]]]]: For each sentence, the top-k mappings from preprocessed word to score, and the mapping from word to token indexes.
188 |     """
189 |
190 | word_to_score_dicts = []
191 | word_to_token_idx_dicts = []
192 |
193 | for idx, sentence in enumerate(preprocessed_text):
194 | tokenized_sentence_tokens = tokenized_text_tokens[idx] # sentence level, List[str]
195 | word_to_token_idx = match_word_to_token_indexes(sentence, tokenized_sentence_tokens, min_n)
196 | score_sentence_topk = scores[idx] # torch.Tensor, token scores, (top_k, seq_len)
197 |
198 | # Calculate the score for each token and map to words
199 | word_to_score_topk = []
200 | for k in range(len(score_sentence_topk)):
201 | # Initialize word-to-score dictionary with zero values
202 | word_to_score = {word: 0 for word in sentence.split()}
203 |
204 | score_sentence = score_sentence_topk[k]
205 | for word, associated_token_idx in word_to_token_idx.items():
206 |                 associated_token_idx = torch.tensor(associated_token_idx).long()  # long dtype for tensor indexing
207 | word_to_score[word] = torch.sum(score_sentence[associated_token_idx]).item()
208 |
209 | word_to_score_topk.append(word_to_score.copy())
210 |
211 | word_to_score_dicts.append(word_to_score_topk)
212 | word_to_token_idx_dicts.append(word_to_token_idx)
213 |
214 | return word_to_score_dicts, word_to_token_idx_dicts
215 |
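# Minimal usage sketch (illustrative values; `tokens` and `attr` are
# hypothetical placeholders): a word's score is the sum of the attribution
# scores of the tokens it maps to, for each of the top_k predictions. Note
# that the two dict arguments and the index arguments are accepted but not
# used by the body above.
#
#   word_scores, word_to_token_idx = compute_preprocessed_word_score(
#       preprocessed_text=["le chat"],
#       tokenized_text_tokens=[tokens],   # List[str] for the sentence
#       scores=[attr],                    # torch.Tensor of shape (top_k, seq_len)
#       id_to_token_dicts=[{}],
#       token_to_id_dicts=[{}],
#       min_n=3,
#   )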
216 |
217 | def compute_word_score(word_to_score_dicts, text, n=5, cutoff=0.75):
218 |     """
219 |     Compute word scores for the original words based on preprocessed word scores.
220 | 
221 |     Args:
222 |         word_to_score_dicts (List[List[Dict[str, float]]]): For each sentence, list of top_k mappings from preprocessed word to score.
223 |         text (List[str]): List of original sentences.
224 |         n (int): Number of closest preprocessed words to consider for a given original word.
225 |         cutoff (float): Minimum similarity score for a match.
226 | 
227 |     Returns:
228 |         Tuple[List[List[torch.Tensor]], List[Dict[str, str]]]: For each sentence, the list of top-k word score tensors, plus the original-to-preprocessed word mappings.
229 |     """
230 |
231 | all_scores_text = []
232 | mappings = []
233 | for idx, word_to_score_topk in enumerate(word_to_score_dicts): # iteration over sentences
234 | all_scores_topk = []
235 | processed_words = list(word_to_score_topk[0].keys())
236 | original_words = text[idx].split()
237 | original_words = list(filter(lambda x: x != ",", original_words))
238 | mapping = map_processed_to_original(
239 | processed_words, original_words, n=n, cutoff=cutoff
240 |         )  # Dict[str, str]: original word -> best-matching preprocessed word
241 | mappings.append(mapping)
242 |         for word_to_score in word_to_score_topk:  # iteration over the top_k predictions
243 | scores = []
244 | stopwords_idx = []
245 | for pos_word, word in enumerate(original_words):
246 | if word not in mapping:
247 | scores.append(0)
248 | stopwords_idx.append(pos_word)
249 | continue
250 | matching_processed_word = mapping[word]
251 | word_score = word_to_score[matching_processed_word]
252 | scores.append(word_score)
253 |
254 |             scores = torch.tensor(scores)
255 |             # Softmax normalization over the words; length = len(original_words)
256 |             scores = F.softmax(scores, dim=-1)
257 |             # Zero out stopwords after normalization
258 |             scores[stopwords_idx] = 0
259 |
260 | all_scores_topk.append(scores) # length top_k
261 |
262 | all_scores_text.append(all_scores_topk) # length = len(text)
263 |
264 | return all_scores_text, mappings
265 |
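# Continuation of the sketch above (illustrative values): the scores on
# preprocessed words are projected back onto the original words through the
# fuzzy mapping, then softmax-normalized per sentence, with stopwords pinned
# to zero.
#
#   all_scores, mappings = compute_word_score(word_scores, ["Le chat"], n=5, cutoff=0.75)
#   # all_scores[i][k] is a tensor with one score per original word of sentence i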
266 |
267 | def explain_continuous(
268 | text, processed_text, tokenized_text_tokens, mappings, word_to_token_idx_dicts, all_attr, top_k
269 | ):
270 |     """
271 |     Score explanation at letter level.
272 | 
273 |     Args:
274 |         text (List[str]): List of original sentences.
275 |         processed_text (List[str]): List of preprocessed sentences.
276 |         tokenized_text_tokens (List[List[str]]): For each sentence, list of tokens in string form.
277 |         mappings (List[Dict[str, str]]): For each sentence, mapping from original word to preprocessed word.
278 |         word_to_token_idx_dicts (List[Dict[str, List[int]]]): For each sentence, mapping from preprocessed word to token indexes.
279 |         all_attr (torch.Tensor): Tensor of token attribution scores, with shape
280 |             (n_sentences, top_k, seq_len).
281 |         top_k (int): Number of top predictions to consider.
282 | 
283 |     Returns:
284 |         torch.Tensor: Letter-level scores for each sentence and each of the
285 |             top_k predictions.
286 |     """
287 | all_scores_text = []
288 | for idx, processed_sentence in enumerate(processed_text):
289 | tokenized_sentence_tokens = tokenized_text_tokens[idx]
290 | mapping = mappings[idx]
291 | word_to_token_idx = word_to_token_idx_dicts[idx]
292 | original_words = text[idx].split()
293 | original_words = list(filter(lambda x: x != ",", original_words))
294 |
295 | original_to_token = {}
296 | original_to_token_idxs = {}
297 |
298 | for original in original_words:
299 |             # Stopwords have no preprocessed counterpart in the mapping: skip them
300 | if original not in mapping:
301 | continue
302 |
303 | matching_processed_word = mapping[original]
304 | associated_token_idx = word_to_token_idx[matching_processed_word]
305 | original_to_token[original] = [
306 | tokenized_sentence_tokens[token_idx] for token_idx in associated_token_idx
307 | ]
308 | original_to_token_idxs[original] = associated_token_idx
309 |
310 | scores_for_k = []
311 | for k in range(top_k):
312 | scores_for_words = []
313 |             for original_word in original_words:
314 |                 original_word_prepro = clean_text_feature(
315 |                     [original_word], remove_stop_words=False
316 |                 )[0]
317 |
318 | letters = list(original_word)
319 | scores_letter = torch.zeros(len(letters), dtype=torch.float32)
320 |
321 |                 if original_word not in original_to_token:  # stopword: keep a zero score
322 | scores_for_words.append(scores_letter)
323 | continue
324 |
325 |                 for pos, token in enumerate(original_to_token[original_word]):
326 |                     pos_token = original_to_token_idxs[original_word][pos]
327 |                     # Normalize the token string before matching it against the word
328 |                     tok = preprocess_token(token)
329 |                     score_token = all_attr[idx, k, pos_token].item()
330 | 
331 |                     # Spread the token's score over the letters it overlaps in the word
332 |                     sm = SequenceMatcher(None, original_word_prepro, tok)
333 |                     a, _, size = sm.find_longest_match()
334 |                     scores_letter[a : a + size] += score_token
335 |
336 | scores_for_words.append(scores_letter)
337 |
338 | all_scores_letter = torch.cat(scores_for_words)
339 | scores = F.softmax(all_scores_letter, dim=-1)
340 |             scores[all_scores_letter == 0] = 0  # keep unmatched letters at exactly zero
341 | scores_for_k.append(scores)
342 |
343 | scores_for_sentence = torch.stack(scores_for_k)
344 | all_scores_text.append(scores_for_sentence)
345 |
346 | return torch.stack(all_scores_text)
347 |
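# End-to-end sketch of the explainability pipeline in this module (hypothetical
# wiring; `scores` and `all_attr` would come from an upstream token-level
# attribution method applied to the classifier):
#
#   word_scores, word_to_token_idx = compute_preprocessed_word_score(
#       processed_text, tokenized_text_tokens, scores,
#       id_to_token_dicts, token_to_id_dicts, min_n=3,
#   )
#   word_level_scores, mappings = compute_word_score(word_scores, text)
#   letter_level_scores = explain_continuous(
#       text, processed_text, tokenized_text_tokens,
#       mappings, word_to_token_idx, all_attr, top_k=1,
#   )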
--------------------------------------------------------------------------------