├── .coveragerc ├── .github ├── ISSUE_TEMPLATE │ ├── bug-report.yaml │ ├── config.yaml │ ├── documentation.yaml │ └── feature-request.yaml ├── pull_request_template.md └── workflows │ ├── build_and_deploy_documentation.yml │ ├── check_arc_runner_env.yml │ ├── linting.yml │ ├── release_automation.yml │ └── tests_full.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── CHANGELOG_DEV.md ├── CITATION.cff ├── CONTRIBUTING.md ├── Dataset.md ├── LICENSE ├── MMAP_DATASET_README.md ├── README.md ├── config_files ├── data_preparation │ ├── packed_cc_en_2048.yaml │ └── packed_dataset_config.yaml ├── text_generation │ └── text_generation_config_torch.yaml └── training │ ├── config_example_coca.yaml │ ├── config_lorem_ipsum_long_fsdp1.yaml │ ├── config_lorem_ipsum_long_fsdp1_warmstart.yaml │ ├── config_lorem_ipsum_long_fsdp2.yaml │ └── config_lorem_ipsum_long_fsdp2_warmstart.yaml ├── data ├── checkpoints │ └── .gitkeep ├── lorem_ipsum.idx ├── lorem_ipsum.jsonl ├── lorem_ipsum.pbin ├── lorem_ipsum_long.idx ├── lorem_ipsum_long.jsonl ├── lorem_ipsum_long.pbin ├── tokenizer │ ├── hf_gpt2 │ │ ├── merges.txt │ │ ├── special_tokens_map.json │ │ ├── tokenizer.json │ │ ├── tokenizer_config.json │ │ └── vocab.json │ ├── sentencepiece_dclm │ │ └── en_32k_tokenizer.model │ └── tokenizer_gpt2.json └── wiki_data_downloader.sh ├── docs ├── Makefile ├── components │ └── components.md ├── dev_ops │ ├── release_procedure.md │ └── tests.md ├── fsdp1_vs_fsdp_2.md ├── make.bat ├── requirements.txt ├── scaling_experiments │ ├── scaling_28B_mbs_1_ac_True.png │ ├── scaling_leonardo.md │ └── scaling_mn5.md ├── source │ ├── banner.jpg │ ├── benchmarking.rst │ ├── conf.py │ ├── configuration.rst │ ├── entrypoints.rst │ ├── future_work.rst │ ├── index.rst │ ├── known_issues.rst │ ├── logo.jpg │ ├── memmap.rst │ ├── model_cards.rst │ ├── quickstart.rst │ └── vs_code_setup.rst └── supported_features.md ├── notebooks ├── components.yaml ├── redpajama_tokenizer_test.ipynb └── tokenizer │ ├── redpajama_v2_samples_512_test.idx │ ├── redpajama_v2_samples_512_test.pbin │ └── unigram_tokenizer.model ├── pyproject.toml ├── scripts ├── convco_for_reverts.sh └── train.sh ├── src └── modalities │ ├── __init__.py │ ├── __main__.py │ ├── api.py │ ├── batch.py │ ├── checkpointing │ ├── __init__.py │ ├── checkpoint_conversion.py │ ├── checkpoint_loading.py │ ├── checkpoint_saving.py │ ├── checkpoint_saving_execution.py │ ├── checkpoint_saving_instruction.py │ ├── checkpoint_saving_strategies.py │ ├── fsdp │ │ ├── __init__.py │ │ ├── fsdp_checkpoint_loading.py │ │ └── fsdp_checkpoint_saving.py │ ├── stateful │ │ ├── __init__.py │ │ ├── app_state.py │ │ └── app_state_factory.py │ └── torch │ │ ├── __init__.py │ │ └── torch_checkpoint_loading.py │ ├── config │ ├── __init__.py │ ├── component_factory.py │ ├── config.py │ ├── instantiation_models.py │ ├── lookup_enum.py │ ├── pydantic_if_types.py │ └── utils.py │ ├── conversion │ ├── __init__.py │ └── gpt2 │ │ ├── __init__.py │ │ ├── configuration_gpt2.py │ │ ├── conversion_code.py │ │ ├── conversion_model.py │ │ ├── conversion_tokenizer.py │ │ ├── convert_gpt2.py │ │ └── modeling_gpt2.py │ ├── dataloader │ ├── __init__.py │ ├── create_index.py │ ├── create_packed_data.py │ ├── dataloader.py │ ├── dataloader_factory.py │ ├── dataset.py │ ├── dataset_factory.py │ ├── large_file_lines_reader.py │ ├── preprocessing │ │ ├── __init__.py │ │ └── tokenization │ │ │ └── tokenized_file_writer.py │ └── samplers.py │ ├── evaluator.py │ ├── exceptions.py │ ├── gym.py │ ├── inference │ ├── 
__init__.py │ ├── inference.py │ └── text │ │ ├── __init__.py │ │ ├── config.py │ │ └── inference_component.py │ ├── logging_broker │ ├── __init__.py │ ├── message_broker.py │ ├── messages.py │ ├── publisher.py │ ├── subscriber.py │ └── subscriber_impl │ │ ├── __init__.py │ │ ├── progress_subscriber.py │ │ ├── results_subscriber.py │ │ └── subscriber_factory.py │ ├── loss_functions.py │ ├── models │ ├── __init__.py │ ├── coca │ │ ├── __init__.py │ │ ├── attention_pooling.py │ │ ├── coca_model.py │ │ ├── collator.py │ │ ├── multi_modal_decoder.py │ │ └── text_decoder.py │ ├── components │ │ ├── __init__.py │ │ └── layer_norms.py │ ├── gpt2 │ │ ├── __init__.py │ │ ├── collator.py │ │ ├── gpt2_model.py │ │ └── pretrained_gpt_model.py │ ├── huggingface │ │ ├── __init__.py │ │ └── huggingface_model.py │ ├── huggingface_adapters │ │ ├── __init__.py │ │ └── hf_adapter.py │ ├── model.py │ ├── model_factory.py │ ├── utils.py │ └── vision_transformer │ │ ├── __init__.py │ │ └── vision_transformer_model.py │ ├── nn │ ├── __init__.py │ ├── attention.py │ ├── mlp.py │ └── model_initialization │ │ ├── __init__.py │ │ ├── composed_initialization.py │ │ ├── initialization_if.py │ │ ├── initialization_routines.py │ │ └── parameter_name_filters.py │ ├── optimizers │ ├── __init__.py │ ├── lr_schedulers.py │ └── optimizer_factory.py │ ├── preprocessing │ ├── __init__.py │ ├── create_chunks.py │ └── shuffle_data.py │ ├── registry │ ├── __init__.py │ ├── components.py │ └── registry.py │ ├── running_env │ ├── __init__.py │ ├── cuda_env.py │ ├── env_utils.py │ └── fsdp │ │ ├── __init__.py │ │ ├── device_mesh.py │ │ ├── fsdp_auto_wrapper.py │ │ └── reducer.py │ ├── tokenization │ ├── __init__.py │ └── tokenizer_wrapper.py │ ├── trainer.py │ ├── training │ ├── __init__.py │ ├── activation_checkpointing.py │ ├── gradient_clipping │ │ ├── __init__.py │ │ ├── fsdp_gradient_clipper.py │ │ ├── fsdp_gradient_clipper_config.py │ │ └── gradient_clipper.py │ └── training_progress.py │ ├── util.py │ └── utils │ ├── __init__.py │ ├── logging.py │ ├── mfu.py │ ├── number_conversion.py │ ├── seeding.py │ ├── typing.py │ └── verify_tokenization_consistency.py ├── tests ├── __init__.py ├── checkpointing │ ├── __init__.py │ ├── checkpointing_test_utils.py │ ├── configs_for_testing │ │ └── gpt2_config_test.yaml │ ├── fsdp2_gpt2_config.yaml │ ├── gpt2_config.yaml │ ├── pytorch │ │ ├── __init__.py │ │ └── test_torch_checkpoint_loading.py │ ├── test_checkpoint_conversion.py │ ├── test_checkpoint_execution_functions.py │ ├── test_checkpoint_strategies.py │ ├── test_fsdp1_to_disc_checkpointing.py │ └── test_fsdp2_dcp_checkpoint_loading_and_saving.py ├── config │ ├── __init__.py │ ├── components.py │ ├── configs.py │ ├── custom_components.py │ ├── test_component_factory.py │ └── test_configs │ │ ├── config_backward_reference.yaml │ │ ├── config_forward_reference.yaml │ │ ├── config_hierarchical_list_component.yaml │ │ ├── config_multiple_top_level_components_with_references.yaml │ │ ├── config_non_existing_reference.yaml │ │ └── config_single_component.yaml ├── conftest.py ├── conversion │ ├── __init__.py │ ├── gpt2 │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── helper.py │ │ ├── test_conversion_code.py │ │ ├── test_conversion_model.py │ │ ├── test_conversion_tokenizer.py │ │ └── test_convert_gpt2.py │ └── test_configs │ │ └── gpt2_config_test.yaml ├── data │ └── datasets │ │ ├── danish_test_dataset.jsonl │ │ ├── lorem_ipsum_long.idx │ │ ├── lorem_ipsum_long.jsonl │ │ ├── lorem_ipsum_long.pbin │ │ └── 
lorem_ipsum_without_last_newline.jsonl ├── dataloader │ ├── __init__.py │ ├── distributed │ │ ├── dist_dataloader_config_with_shuffling.yaml │ │ ├── dist_dataloader_config_with_shuffling_and_skipped_batches.yaml │ │ ├── dist_dataloader_config_without_shuffling.yaml │ │ └── test_distributed_dataloader.py │ ├── dummy_sequential_dataset.py │ ├── preprocessing │ │ ├── __init__.py │ │ ├── chunking │ │ │ ├── __init__.py │ │ │ └── test_create_chunks.py │ │ └── tokenization │ │ │ ├── __init__.py │ │ │ └── test_tokenized_file_writer.py │ ├── samplers │ │ ├── __init__.py │ │ ├── test_distributed_samplers.py │ │ └── test_sequential_samplers.py │ ├── test_combined_dataset.py │ ├── test_dataloader.py │ ├── test_dummy_dataset.py │ ├── test_end_to_end_indexation_and_tokenization.py │ ├── test_large_file_lines_reader.py │ ├── test_packed_dataset.py │ ├── test_shuffle_tokenized_data.py │ └── yaml_configs │ │ └── skipped_dataloader.yaml ├── end2end_tests │ ├── __init__.py │ ├── custom_components.py │ ├── gpt2_train_num_steps_8.yaml │ ├── gpt2_warm_start_from_step_4.yaml │ ├── lorem_ipsum.pbin │ ├── system_tests │ │ ├── __init__.py │ │ ├── configs │ │ │ ├── fsdp1_gpt2_train_num_steps_8.yaml │ │ │ └── fsdp2_gpt2_train_num_steps_8.yaml │ │ └── test_fsdp_loss_convergence.py │ ├── test_create_filtered_tokenized_dataset.py │ ├── test_create_shuffled_dataset_chunk.py │ ├── test_create_shuffled_jsonl_dataset_chunk.py │ ├── test_fsdp_warmstart.py │ ├── test_shuffle_jsonl_data.py │ ├── test_shuffle_tokenized_data.py │ └── test_utils.py ├── fsdp2_parallelization │ ├── __init__.py │ └── test_full_and_hybrid_sharding.py ├── models │ ├── __init__.py │ ├── coca │ │ ├── __init__.py │ │ ├── coca_config.yaml │ │ ├── test_attention_pooling.py │ │ └── test_coca.py │ ├── components │ │ ├── __init__.py │ │ └── test_layer_norms.py │ ├── test_causal_self_attention.py │ ├── test_hf_adapter.py │ ├── test_model_factory.py │ └── vision_transformer │ │ ├── test_vision_transformer.py │ │ └── vision_transformer_config.yaml ├── nn │ ├── test_attention.py │ └── test_mlp.py ├── run_all_tests.sh ├── run_distributed_tests.sh ├── test_evaluator.py ├── test_gradient_clipping.py ├── test_gym.py ├── test_initialization_fsdp1.py ├── test_initialization_fsdpx.py ├── test_loss_functions.py ├── test_lr_scheduler.py ├── test_main.py ├── test_optimizer_factory.py ├── test_rotary_qkv_transform.py ├── test_tokenization.py ├── test_torch_compile.py ├── test_utils.py ├── test_weight_tying.py ├── test_yaml_configs │ ├── coca_config_initialization.yaml │ ├── config_lorem_ipsum_fsdp1.yaml │ ├── config_lorem_ipsum_fsdp2.yaml │ ├── gpt2_config_initialization.yaml │ ├── gpt2_config_initialization_fsdp1.yaml │ ├── gpt2_config_initialization_fsdp2.yaml │ ├── gpt2_config_mfu_fsdp1.yaml │ ├── gpt2_config_mfu_fsdp2.yaml │ └── gpt2_config_optimizer.yaml ├── tests.py ├── tmp │ └── .gitkeep └── utils │ ├── __init__.py │ ├── test_experiment_id_generation.py │ ├── test_mfu.py │ ├── test_number_conversion.py │ └── test_seeding.py └── tutorials ├── getting_started ├── README.md ├── checkpoints │ └── .gitkeep ├── configs │ ├── example_config.yaml │ ├── example_conversion_config_template.yaml │ ├── example_dataset_config_test.yaml │ ├── example_dataset_config_train.yaml │ └── example_text_generation_config.yaml ├── data │ ├── mem_map │ │ └── .git_keep │ └── raw │ │ ├── redpajama_v2_samples_512_test.jsonl │ │ └── redpajama_v2_samples_512_train.jsonl ├── scripts │ ├── run_checkpoint_conversion.sh │ └── run_getting_started_example.sh └── tokenizer │ ├── tokenizer.json │ └── 
tokenizer_config.json ├── library_usage ├── README.md ├── config_lorem_ipsum.yaml ├── main.py ├── run.sh └── tokenizer │ ├── tokenizer.json │ └── tokenizer_config.json ├── modalities_in_15_mins ├── README.md ├── configs │ ├── pretraining_config.yaml │ └── tokenization_config.yaml ├── data │ ├── checkpoints │ │ └── .gitkeep │ ├── preprocessed │ │ └── .gitkeep │ ├── raw │ │ └── .gitkeep │ └── tokenizer │ │ ├── tokenizer.json │ │ └── tokenizer_config.json ├── modalities_demo.ipynb └── res │ ├── banner.jpg │ ├── fsdp_bright.svg │ ├── modalities_file_format_bright.svg │ ├── modalities_indexation_bright.svg │ ├── modalities_tokenization_bright.svg │ └── notebooks_1.png └── warmstart ├── README.md ├── configs ├── pre_training_config.yaml ├── tokenization_config_train.yaml └── warmstart_config.yaml └── scripts ├── check_checkpoint_consistency.py └── pre_train_and_warmstart.sh /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | parallel = True 3 | branch = True 4 | source = src/modalities 5 | 6 | 7 | [report] 8 | exclude_lines = 9 | # Exclude lines that match the following patterns 10 | pragma: no cover 11 | if __name__ == .__main__.: 12 | 13 | omit = 14 | /tmp/* 15 | /usr/* 16 | */tests/* 17 | # Add more patterns if necessary 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.yaml: -------------------------------------------------------------------------------- 1 | name: 🐛 Bug Report 2 | description: Submit a bug report to help improve modalities 3 | labels: [ "bug" ] 4 | 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: > 9 | #### Before submitting a bug report, please make sure the issue hasn't already been addressed, by searching through [the existing and past issues](https://github.com/Modalities/modalities/issues). 10 | 11 | - type: textarea 12 | id: system-info 13 | attributes: 14 | label: System Info 15 | description: Please share your system info with us. 16 | placeholder: modalities version, platform, python version, ... 17 | validations: 18 | required: true 19 | 20 | - type: textarea 21 | attributes: 22 | label: 🐛 Describe the bug 23 | description: | 24 | Please provide a clear and concise description of what the bug is. If relevant, add a minimal example so that we can reproduce the error by running the code. Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. 25 | placeholder: | 26 | A clear and concise description of what the bug is. 27 | 28 | ```python 29 | # Sample code to reproduce the problem 30 | ``` 31 | 32 | ``` 33 | The error message you got, with the full traceback. 34 | ``` 35 | validations: 36 | required: true 37 | 38 | - type: markdown 39 | attributes: 40 | value: > 41 | Thanks for contributing 🎉! 
-------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yaml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation.yaml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation 2 | description: Report an issue related to https://modalities.github.io/modalities/ 3 | labels: [ "documentation" ] 4 | 5 | body: 6 | - type: textarea 7 | attributes: 8 | label: 📚 The doc issue 9 | description: > 10 | A clear and concise description of what content in https://modalities.github.io/modalities/ is an issue. 11 | validations: 12 | required: true 13 | 14 | - type: textarea 15 | attributes: 16 | label: Suggest a potential alternative/fix 17 | description: > 18 | Tell us how we could improve the documentation in this regard. 19 | 20 | - type: markdown 21 | attributes: 22 | value: > 23 | Thanks for contributing 🎉! -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.yaml: -------------------------------------------------------------------------------- 1 | name: 🚀 Feature Request 2 | description: Submit a proposal/request for a new modalities feature 3 | labels: [ "feature" ] 4 | 5 | body: 6 | - type: textarea 7 | id: feature-request 8 | validations: 9 | required: true 10 | attributes: 11 | label: Feature request 12 | description: | 13 | A clear and concise description of the feature proposal. 14 | 15 | - type: textarea 16 | id: motivation 17 | validations: 18 | required: true 19 | attributes: 20 | label: Motivation 21 | description: | 22 | Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link it here, too. 23 | 24 | - type: markdown 25 | attributes: 26 | value: > 27 | Thanks for contributing 🎉! -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # What does this PR do? 2 | 3 | This PR .. 4 | 5 | ## General Changes 6 | * .. 7 | 8 | ## Breaking Changes 9 | * .. 10 | 11 | ## Checklist before submitting final PR 12 | - [ ] My PR is minimal and addresses one issue in isolation 13 | - [ ] I have merged the latest version of the target branch into this feature branch 14 | - [ ] I have reviewed my own code w.r.t. correct implementation, missing type hints, proper documentation, etc. 
15 | - [ ] I have run a sample config for model training 16 | - [ ] I have checked that all tests run through (`python tests/tests.py`) 17 | - [ ] I have updated the internal changelog (`CHANGELOG_DEV.md`) -------------------------------------------------------------------------------- /.github/workflows/build_and_deploy_documentation.yml: -------------------------------------------------------------------------------- 1 | name: "Build Sphinx Documentation" 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - main 8 | 9 | permissions: 10 | contents: write 11 | 12 | jobs: 13 | docs: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.11"] 18 | steps: 19 | - uses: actions/checkout@v4 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v3 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - name: Install dependencies 25 | run: | 26 | sudo apt-get update 27 | sudo apt-get install git -y 28 | python -m pip install torch==2.6.0 29 | python -m pip install --upgrade pip setuptools wheel 30 | export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE 31 | python -m pip install -e . 32 | python -m pip install myst-parser 33 | python -m pip install sphinx-rtd-theme sphinx-autodoc-typehints sphinx-click sphinx-automodapi texext 34 | - name: "Parse into HTML" 35 | run: | 36 | sphinx-apidoc -o docs/source/api src/modalities 37 | sphinx-build -M html docs/source/ docs/build/ 38 | - name: Deploy to GitHub Pages 39 | uses: peaceiris/actions-gh-pages@v3 40 | with: 41 | publish_branch: gh-pages 42 | github_token: ${{ secrets.GITHUB_TOKEN }} 43 | publish_dir: docs/build/html 44 | force_orphan: true 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /.github/workflows/check_arc_runner_env.yml: -------------------------------------------------------------------------------- 1 | name: Check Arc Runner Environment 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - check_env_workflow 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build: 12 | runs-on: arc-runner-set 13 | strategy: 14 | matrix: 15 | python-version: ["3.11"] 16 | steps: 17 | - uses: actions/checkout@v4 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v3 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Check arc runner environment 23 | run: | 24 | nvidia-smi 25 | echo $CUDA_VISIBLE_DEVICES -------------------------------------------------------------------------------- /.github/workflows/linting.yml: -------------------------------------------------------------------------------- 1 | name: Linting 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | workflow_dispatch: 8 | pull_request: 9 | types: [review_requested, ready_for_review, auto_merge_enabled] 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: ["3.11"] 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v3 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install pre-commit 27 | pre-commit install 28 | - name: Analysing the code with pre-commit 29 | run: | 30 | pre-commit run --all-files 31 | -------------------------------------------------------------------------------- /.github/workflows/tests_full.yml: 
-------------------------------------------------------------------------------- 1 | name: Tests Full 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build: 11 | runs-on: arc-runner-set 12 | strategy: 13 | matrix: 14 | python-version: ["3.11"] 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: Set up Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v3 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | - name: Install dependencies 22 | run: | 23 | sudo apt-get update 24 | sudo apt-get install curl -y # required by coveralls 25 | sudo apt-get install git -y 26 | python -m pip install torch==2.6.0 27 | python -m pip install --upgrade pip setuptools wheel 28 | export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE 29 | python -m pip install ninja # Lowers compilation time of flash attention significantly 30 | python -m pip install flash-attn --no-build-isolation 31 | python -m pip install -e .[tests] 32 | - name: Run tests 33 | run: | 34 | pytest 35 | # sh tests/run_all_tests.sh 0 1 36 | - name: Coveralls 37 | uses: coverallsapp/github-action@v2 38 | with: 39 | github-token: ${{ secrets.GITHUB_TOKEN }} 40 | # - name: Upload coverage data to coveralls.io 41 | # run: | 42 | # python -m pip install coveralls[toml] 43 | # COVERALLS_INPUT=.coverage_reports/.coverage coveralls --service=github 44 | # env: 45 | # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 46 | 47 | 48 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_install_hook_types: 2 | - pre-commit 3 | - prepare-commit-msg 4 | - commit-msg 5 | repos: 6 | - repo: https://github.com/pycqa/isort 7 | rev: 5.11.5 8 | hooks: 9 | - id: isort 10 | stages: [pre-commit] 11 | - repo: https://github.com/psf/black-pre-commit-mirror 12 | rev: 23.9.1 13 | hooks: 14 | - id: black 15 | language_version: python3.11 16 | stages: [pre-commit] 17 | - repo: https://github.com/astral-sh/ruff-pre-commit 18 | rev: v0.0.278 19 | hooks: 20 | - id: ruff 21 | args: [--fix, --exit-non-zero-on-fix] 22 | stages: [pre-commit] 23 | - repo: local 24 | hooks: 25 | - id: custom-commit-msg 26 | stages: [prepare-commit-msg] 27 | name: "Apply conventional commit constraints to default revert message" 28 | entry: ./scripts/convco_for_reverts.sh 29 | language: system 30 | types: [text] 31 | - repo: https://github.com/LuzianHahn/conventional-pre-commit 32 | rev: v2.4.1 33 | hooks: 34 | - id: conventional-pre-commit 35 | stages: [commit-msg] 36 | args: [feat, fix, ci, chore, test, refactor, debug, docs, perf, revert] 37 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | 3 | build: 4 | os: "ubuntu-22.04" 5 | tools: 6 | python: "3.10" 7 | 8 | python: 9 | install: 10 | - requirements: docs/requirements.txt 11 | 12 | sphinx: 13 | configuration: docs/source/conf.py 14 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: If you use this software, please cite both the article from preferred-citation and the software itself. 
3 | authors: 4 | - family-names: Lübbering 5 | given-names: Max 6 | - family-names: Ali 7 | given-names: Mehdi 8 | - family-names: Stollenwerk 9 | given-names: Felix 10 | - family-names: Fromm 11 | given-names: Michael 12 | - family-names: Weber 13 | given-names: Alexander Arno 14 | - family-names: Rutmann 15 | given-names: Richard 16 | title: 'Modalities: A PyTorch-native framework for distributed and reproducible foundation model training.' 17 | version: 0.3.2 18 | url: https://github.com/Modalities/modalities 19 | date-released: '2024-12-02' 20 | preferred-citation: 21 | authors: 22 | - family-names: Lübbering 23 | given-names: Max 24 | - family-names: Ali 25 | given-names: Mehdi 26 | - family-names: Stollenwerk 27 | given-names: Felix 28 | - family-names: Fromm 29 | given-names: Michael 30 | - family-names: Weber 31 | given-names: Alexander Arno 32 | - family-names: Rutmann 33 | given-names: Richard 34 | title: 'Modalities: A PyTorch-native framework for distributed and reproducible foundation model training.' 35 | url: https://github.com/Modalities/modalities 36 | type: generic 37 | year: '2024' 38 | conference: {} 39 | publisher: {} 40 | -------------------------------------------------------------------------------- /Dataset.md: -------------------------------------------------------------------------------- 1 | # MemMap Datasets 2 | 3 | ## MemMapDataset Index Generator 4 | 5 | The `MemMapDataset` requires an index file providing the necessary pointers into the raw data file. The `MemMapDataset` can create the index file lazily; however, it is advised to create it beforehand. This can be done by running 6 | 7 | ```sh 8 | modalities data create_raw_index <path_to_jsonl_file> 9 | ``` 10 | 11 | The index will be created in the same directory as the raw data file. For further options you may look into the usage documentation via `modalities data create_raw_index --help`. 12 | 13 | ## Packed Dataset Generator 14 | 15 | The `PackedMemMapDatasetContinuous` and `PackedMemMapDatasetMegatron` require a packed data file. To create the data file, you first have to generate a `MemMapDataset` index file as described [above](#memmapdataset-index-generator). Assuming the index and raw data are located in the same directory, you can simply execute the following command: 16 | 17 | ```sh 18 | modalities data pack_encoded_data <path_to_config_file> 19 | ``` 20 | 21 | The packed data file will be created in the same directory as the raw data file. For further options you may look into the usage documentation via `modalities data pack_encoded_data --help`. 22 | 23 | ### Packed Data Format 24 | 25 | The packed data file is a bytestream containing both the tokenized data as well as an index denoting the start and length of the tokenized documents inside the bytestream. The data file consists of 3 concatenated parts: 26 | 27 | header segment | data segment | index segment 28 | 29 | * **header segment**: This section is an 8-byte integer that encodes the length of the data segment in bytes. 30 | * **data segment**: This section contains a concatenation of all documents in the form of 4-byte tokens. 31 | An end-of-sequence token is placed between consecutive documents. 32 | * **index segment**: This section contains a pickled index which locates the documents inside the data segment. 33 | The index is a list of tuples, where each tuple contains the start position and length in bytes for the 34 | corresponding document, e.g., `[(start_doc1, len_doc1), (start_doc2, len_doc2), ....]`.
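To make the three segments concrete, here is a minimal reader sketch in plain Python. It is **not** the loader shipped with Modalities (see `src/modalities/dataloader/` for the actual implementation), and it makes assumptions the format description above does not pin down: a little-endian byte order, `uint32` tokens, and index offsets measured relative to the start of the data segment. The helper name `read_packed_file` is made up for illustration.

```python
import pickle

import numpy as np

HEADER_SIZE_IN_BYTES = 8  # header segment: length of the data segment in bytes
TOKEN_SIZE_IN_BYTES = 4   # each token occupies 4 bytes in the data segment


def read_packed_file(file_path: str) -> list[np.ndarray]:
    """Parse a packed data file into one token array per document."""
    with open(file_path, "rb") as f:
        raw = f.read()

    # header segment: length of the data segment in bytes (byte order assumed little-endian)
    data_segment_length = int.from_bytes(raw[:HEADER_SIZE_IN_BYTES], byteorder="little")

    # data segment: all documents concatenated as fixed-width tokens
    data_segment = raw[HEADER_SIZE_IN_BYTES : HEADER_SIZE_IN_BYTES + data_segment_length]

    # index segment: pickled list of (start, length) tuples in bytes,
    # assumed here to be relative to the start of the data segment
    index: list[tuple[int, int]] = pickle.loads(raw[HEADER_SIZE_IN_BYTES + data_segment_length :])

    documents = []
    for start, length in index:
        assert length % TOKEN_SIZE_IN_BYTES == 0, "document length must be a multiple of the token width"
        documents.append(np.frombuffer(data_segment[start : start + length], dtype=np.uint32))
    return documents
```

In practice you would memory-map the file instead of reading it into memory as a whole, which is exactly what the `PackedMemMapDataset*` classes are there for.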
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Modalities Project Team 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /config_files/data_preparation/packed_cc_en_2048.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | src_path: /workspaces/modalities/data/cc_en/raw/train.jsonl 3 | dst_path: /workspaces/modalities/data/cc_en/processed/train.pbin 4 | index_path: /workspaces/modalities/data/cc_en/processed/train.idx 5 | jq_pattern: .text 6 | num_cpus: ${node_env:num_cpus} 7 | eod_token: 8 | processing_batch_size: 1000 9 | raw_samples_queue_size: 300 10 | processed_samples_queue_size: 300 11 | 12 | tokenizer: 13 | component_key: tokenizer 14 | variant_key: pretrained_sp_tokenizer 15 | config: 16 | tokenizer_model_file: /workspaces/modalities/data/tokenizer/sp_bpe_en/bpe_tokenizer.model 17 | padding: false 18 | truncation: false 19 | -------------------------------------------------------------------------------- /config_files/data_preparation/packed_dataset_config.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | src_path: data/lorem_ipsum.jsonl 3 | dst_path: data/lorem_ipsum.pbin 4 | index_path: data/lorem_ipsum.idx 5 | jq_pattern: .text 6 | num_cpus: ${node_env:num_cpus} 7 | eod_token: <|endoftext|> 8 | processing_batch_size: 10 9 | raw_samples_queue_size: 20 10 | processed_samples_queue_size: 20 11 | 12 | tokenizer: 13 | component_key: tokenizer 14 | variant_key: pretrained_hf_tokenizer 15 | config: 16 | pretrained_model_name_or_path: data/tokenizer/hf_gpt2 17 | padding: false 18 | truncation: false 19 | -------------------------------------------------------------------------------- /data/checkpoints/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/data/checkpoints/.gitkeep -------------------------------------------------------------------------------- /data/lorem_ipsum.idx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/data/lorem_ipsum.idx -------------------------------------------------------------------------------- /data/lorem_ipsum.pbin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/data/lorem_ipsum.pbin -------------------------------------------------------------------------------- /data/lorem_ipsum_long.idx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/data/lorem_ipsum_long.idx -------------------------------------------------------------------------------- /data/lorem_ipsum_long.pbin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/data/lorem_ipsum_long.pbin -------------------------------------------------------------------------------- /data/tokenizer/hf_gpt2/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "bos_token": "<|endoftext|>", 3 | "eos_token": "<|endoftext|>", 4 | "unk_token": "<|endoftext|>" 5 | } 6 | -------------------------------------------------------------------------------- /data/tokenizer/hf_gpt2/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "add_prefix_space": false, 3 | "added_tokens_decoder": { 4 | "50256": { 5 | "content": "<|endoftext|>", 6 | "lstrip": false, 7 | "normalized": true, 8 | "rstrip": false, 9 | "single_word": false, 10 | "special": true 11 | } 12 | }, 13 | "bos_token": "<|endoftext|>", 14 | "clean_up_tokenization_spaces": true, 15 | "eos_token": "<|endoftext|>", 16 | "model_max_length": 1024, 17 | "tokenizer_class": "GPT2Tokenizer", 18 | "unk_token": "<|endoftext|>" 19 | } 20 | -------------------------------------------------------------------------------- /data/tokenizer/sentencepiece_dclm/en_32k_tokenizer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/data/tokenizer/sentencepiece_dclm/en_32k_tokenizer.model -------------------------------------------------------------------------------- /data/wiki_data_downloader.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Create the "data" folder if it doesn't exist 4 | mkdir -p data 5 | 6 | # Download the files into the "data" folder 7 | wget -P data https://public-nlp-datasets.s3.us-west-2.amazonaws.com/wikihowAll.csv 8 | wget -P data https://public-nlp-datasets.s3.us-west-2.amazonaws.com/wikihowSep.csv -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/dev_ops/release_procedure.md: -------------------------------------------------------------------------------- 1 | # Releasing in Modalities 2 | This tutorial describes the procedure to release a new version of the Modalities package. 3 | 4 | ## Release Types 5 | We follow the release types as defined by [Semantic Versioning](https://semver.org/). The version number is defined as `MAJOR.MINOR.PATCH` where: 6 | - `MAJOR` is incremented when you make incompatible API changes, 7 | - `MINOR` is incremented when you add functionality in a backwards-compatible manner, and 8 | - `PATCH` is incremented when you make backwards-compatible bug fixes. 9 | 10 | 11 | ## Releasing a new Modalities version 0. Make sure that the main branch is in a clean state. In particular, all tests should pass. 13 | 1. Update the version number in the `pyproject.toml` and `CITATION.cff` files. 14 | 2. Commit the version bump via `git commit --no-verify -m "<version>"`, following the versioning convention **v**MAJOR.MINOR.PATCH (Note the leading v!). 15 | The `--no-verify` flag is used to skip the pre-commit hooks. 16 | 3. Run `git push` to push the changes to the remote repository. 17 | 4. Tag the commit with the version number following the convention `git tag <version>`. 18 | 5. Push the tag to the remote repository using `git push --tags`. Note that this command will push all the tags to the remote repository. 19 | This command triggers the [CI/CD pipeline](../../.github/workflows/release_automation.yml) to build and deploy the package to the PyPI repository. 20 | -------------------------------------------------------------------------------- /docs/dev_ops/tests.md: -------------------------------------------------------------------------------- 1 | # Testing Modalities 2 | 3 | Modalities has a threefold setup for testing, namely 4 | 5 | * Main tests
6 | The main tests comprise CPU, single-GPU and multi-GPU tests. The latter create a distributed environment internally and allow end2end testing of Modalities. 7 | Each test defines its requirements (typically the number of GPUs) and is skipped if they are not met. 8 | 9 | * Torchrun tests
10 | These tests are run from a shell script using torchrun and are typically end2end or at least integration tests. Since we implemented distributed testing using multiprocessing within Modalities, these tests will be integrated into the main tests in the long term. Note that some of the torchrun tests have already been migrated to the main tests. 11 | 12 | * Example / Tutorial tests
13 | These tests take an example config (e.g., a training or warmstart config) and execute it. They only verify that the config runs through without errors; the results of the run are not checked and have to be inspected manually. 14 | 15 | ## Testing Entry Points 16 | There is a single entrypoint to run all test types specified above. 17 | For a full specification of the test API run 18 | 19 | ```bash 20 | cd modalities 21 | python tests/tests.py --help 22 | ``` 23 | 24 | in your command line. 25 | 26 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==7.1.2 2 | sphinx-rtd-theme==1.3.0rc1 3 | -------------------------------------------------------------------------------- /docs/scaling_experiments/scaling_28B_mbs_1_ac_True.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/docs/scaling_experiments/scaling_28B_mbs_1_ac_True.png -------------------------------------------------------------------------------- /docs/source/banner.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/docs/source/banner.jpg -------------------------------------------------------------------------------- /docs/source/benchmarking.rst: -------------------------------------------------------------------------------- 1 | Benchmarking 2 | ============================= 3 | **EDIT "docs/source/benchmarking.rst" IN ORDER TO MAKE CHANGES HERE** 4 | -------------------------------------------------------------------------------- /docs/source/entrypoints.rst: -------------------------------------------------------------------------------- 1 | .. role:: python(code) 2 | :language: python 3 | 4 | .. role:: bash(code) 5 | :language: bash 6 | 7 | 8 | Entrypoints 9 | ======================================================= 10 | 11 | We use `click <https://click.palletsprojects.com/>`_ as a tool to add new entry points and their CLI arguments. 12 | For this we have a main entry point from which all other entry points are started.
13 | 14 | The main entry point is :file:`src/modalities/__main__.py:main()`. 15 | We register other sub-entrypoints by using our main :python:`click.group`, called :python:`main`, as follows: 16 | 17 | .. code-block:: python 18 | 19 | @main.command(name="my_new_entry_point") 20 | 21 | 22 | See the following full example: 23 | 24 | .. code-block:: python 25 | 26 | from pathlib import Path 27 | import click 28 | import click_pathlib 29 | 30 | 31 | @click.group() 32 | def main() -> None: 33 | pass 34 | 35 | 36 | config_option = click.option( 37 | "--config_file_path", 38 | type=click_pathlib.Path(exists=False), 39 | required=True, 40 | help="Path to the YAML config file.", 41 | ) 42 | 43 | 44 | @main.command(name="do_stuff") 45 | @config_option 46 | @click.option( 47 | "--my_cli_argument", 48 | type=int, 49 | required=True, 50 | help="New integer argument", 51 | ) 52 | def entry_point_do_stuff(config_file_path: Path, my_cli_argument: int): 53 | print(f"Do stuff with {config_file_path} and {my_cli_argument}...") 54 | ... 55 | 56 | if __name__ == "__main__": 57 | main() 58 | 59 | With 60 | 61 | .. code-block:: toml 62 | 63 | [project.scripts] 64 | modalities = "modalities.__main__:main" 65 | 66 | in our :file:`pyproject.toml`, we can either invoke only :python:`main` via :python:`modalities` (which does nothing on its own), or a specific sub-entrypoint, e.g., :bash:`modalities do_stuff --config_file_path config_files/config.yaml --my_cli_argument 3537`. 67 | 68 | Alternatively, run the module directly via :bash:`python src/modalities/__main__.py do_stuff --config_file_path config_files/config.yaml --my_cli_argument 3537`. 69 | -------------------------------------------------------------------------------- /docs/source/future_work.rst: -------------------------------------------------------------------------------- 1 | Future Work 2 | ======================================================= 3 | 4 | The team is currently extending the established LLM code base to bring multi-modality into the mix. This extension will be based on ideas similar to CoCa and/or AudioPaLM, which would enable users to either use different encoders for different modalities in conjunction with a text-based decoder, or use a decoder-only architecture. 5 | In the future, modalities other than text can be used, namely 6 | 7 | * image 8 | * audio 9 | * video 10 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to Modalities' documentation! 2 | ====================================================================== 3 | 4 | We propose a novel training framework for Multimodal Large Language Models (LLMs) that prioritizes code readability and efficiency. 5 | The codebase adheres to the principles of "clean code," minimizing Lines of Code (LoC) while maintaining extensibility. 6 | A single, comprehensive configuration file enables easy customization of various model and training parameters. 7 | 8 | A key innovation is the adoption of a PyTorch-native training loop integrated with the Fully Sharded Data Parallelism (FSDP) technique. 9 | FSDP optimizes memory usage and training speed, enhancing scalability for large-scale multimodal models. 10 | By leveraging PyTorch's native capabilities, our framework simplifies the development process and promotes ease of maintenance. 11 | 12 | The framework's modular design facilitates experimentation with different multimodal architectures and training strategies.
13 | Users can seamlessly integrate diverse datasets and model components, allowing for comprehensive exploration of multimodal learning tasks. 14 | The combination of clean code, minimal configuration, and PyTorch-native training with FSDP contributes to a user-friendly and efficient platform for developing state-of-the-art multimodal language models. 15 | 16 | .. note:: 17 | 18 | This project is under active development. 19 | 20 | .. toctree:: 21 | :caption: Getting Started 22 | 23 | quickstart 24 | configuration 25 | model_cards 26 | benchmarking 27 | known_issues 28 | 29 | .. toctree:: 30 | :caption: Datasets 31 | 32 | memmap 33 | 34 | .. toctree:: 35 | :caption: Entrypoints 36 | 37 | entrypoints 38 | 39 | .. toctree:: 40 | :caption: VSCode Setup 41 | 42 | vs_code_setup 43 | 44 | 45 | .. toctree:: 46 | :caption: Future Work 47 | 48 | future_work 49 | 50 | .. toctree:: 51 | :caption: API 52 | 53 | api/modules -------------------------------------------------------------------------------- /docs/source/known_issues.rst: -------------------------------------------------------------------------------- 1 | Known Issues 2 | ================================================================== 3 | 4 | `GitHub Issues <https://github.com/Modalities/modalities/issues>`_ 5 | 6 | 1. hardcoded dataset path :file:`/raid/s3/opengptx/mehdi/temp/temp_data/train_text_document.bin` in :file:`config/config.yaml` 7 | 2. Dependency on Weights & Biases 8 | -------------------------------------------------------------------------------- /docs/source/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/docs/source/logo.jpg -------------------------------------------------------------------------------- /docs/source/memmap.rst: -------------------------------------------------------------------------------- 1 | .. role:: python(code) 2 | :language: python 3 | 4 | .. role:: bash(code) 5 | :language: bash 6 | 7 | MemMap Datasets 8 | ==================================================== 9 | 10 | MemMapDataset Index Generator 11 | ------------------------------------------------------------------------------ 12 | 13 | The :python:`MemMapDataset` requires an index file providing the necessary pointers into the raw data file. The :python:`MemMapDataset` can create the index file lazily; however, it is advised to create it beforehand. This can be done by running 14 | 15 | .. code-block:: bash 16 | 17 | modalities data create_raw_index <path_to_jsonl_file> 18 | 19 | The index will be created in the same directory as the raw data file. For further options you may look into the usage documentation via :bash:`modalities data create_raw_index --help`. 20 | 21 | Packed Dataset Generator 22 | -------------------------------------------------------------------------------- 23 | 24 | The :python:`PackedMemMapDatasetContinuous` and :python:`PackedMemMapDatasetMegatron` require a packed data file. To create the data file, you first have to generate a :python:`MemMapDataset` index file as described above. Assuming the index and raw data are located in the same directory, you can simply execute the following command: 25 | 26 | .. code-block:: bash 27 | 28 | modalities data pack_encoded_data <path_to_config_file> 29 | 30 | The packed data file will be created in the same directory as the raw data file. For further options you may look into the usage documentation via :bash:`modalities data pack_encoded_data --help`.
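To see both steps end-to-end, they can be chained on the lorem ipsum sample data that ships with this repository. This is an illustrative sketch: the argument of :bash:`pack_encoded_data` is assumed here to be a data-preparation config such as :file:`config_files/data_preparation/packed_dataset_config.yaml` (whose :python:`dst_path` points to :file:`data/lorem_ipsum.pbin`); check :bash:`--help` for the authoritative CLI signature.

.. code-block:: bash

   # 1) create the index next to the raw jsonl file (-> data/lorem_ipsum.idx)
   modalities data create_raw_index data/lorem_ipsum.jsonl

   # 2) tokenize and pack the data as specified in the preparation config (-> data/lorem_ipsum.pbin)
   modalities data pack_encoded_data config_files/data_preparation/packed_dataset_config.yaml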
31 | 32 | Packed Data Format 33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 34 | 35 | The packed data file is a bytestream containing both the tokenized data as well as an index denoting the start and length of the tokenized documents inside the bytestream. The data file consists of 3 concatenated parts: 36 | 37 | header segment | data segment | index segment 38 | 39 | * **header segment**: This section is an 8-byte integer that encodes the length of the data segment in bytes. 40 | * **data segment**: This section contains a concatenation of all documents in the form of 4-byte tokens. An end-of-sequence token is placed between consecutive documents. 41 | * **index segment**: This section contains a pickled index which locates the documents inside the data segment. The index is a list of tuples, where each tuple contains the start position and length in bytes for the corresponding document, e.g., :python:`[(start_doc1, len_doc1), (start_doc2, len_doc2), ....]`. 42 | -------------------------------------------------------------------------------- /docs/source/model_cards.rst: -------------------------------------------------------------------------------- 1 | Model Cards 2 | ==================================================== 3 | 4 | -------------------------------------------------------------------------------- /docs/source/quickstart.rst: -------------------------------------------------------------------------------- 1 | Quickstart 2 | ==================================================== 3 | 4 | Installation 5 | ----------------------------------------------------- 6 | Set up a conda environment `conda create -n modalities python=3.10 && conda activate modalities` and install the requirements `pip install -e .`. 7 | 8 | Setup Dataset 9 | ------------------------------------------------- 10 | To start a training, you first need to create a memmap index from a jsonl file, then pack the dataset, and finally run the training. 11 | 12 | .. code-block:: bash 13 | 14 | # Create memmap index from jsonl file. 15 | modalities data create_raw_index <path_to_jsonl_file> 16 | 17 | # Create packed dataset. 18 | modalities data pack_encoded_data <path_to_jsonl_file> 19 | 20 | For instance, using the lorem ipsum example: 21 | 22 | .. code-block:: bash 23 | 24 | # Create memmap index from jsonl file. 25 | modalities data create_raw_index data/lorem_ipsum.jsonl 26 | 27 | # Create packed dataset. 28 | modalities data pack_encoded_data data/lorem_ipsum.jsonl 29 | 30 | Training 31 | ---------------------------------------------------- 32 | To run a training in a multi-GPU setting, the following environment variables are required: 33 | 34 | .. code-block:: bash 35 | 36 | CUDA_VISIBLE_DEVICES=0,1 torchrun --nnodes 1 --nproc_per_node 2 --rdzv-endpoint=0.0.0.0:29502 src/modalities/__main__.py run --config_file_path config_files/config_lorem_ipsum.yaml 37 | 38 | **Evaluation:** 39 | WIP: contents to be added. 40 | -------------------------------------------------------------------------------- /docs/source/vs_code_setup.rst: -------------------------------------------------------------------------------- 1 | VSCode Setup 2 | ==================================================== 3 | 4 | 5 | 6 | We recommend a Docker environment based on a recent PyTorch image, e.g.: 7 | 8 | ..
code-block:: bash 9 | 10 | FROM pytorch/pytorch:2.1.2-cuda12.1-cudnn8-devel 11 | RUN apt-get update && apt-get install -y wget openssh-client git-core bash-completion 12 | RUN wget -O /tmp/git-lfs.deb https://packagecloud.io/github/git-lfs/packages/ubuntu/focal/git-lfs_2.13.3_amd64.deb/download.deb && \ 13 | dpkg -i /tmp/git-lfs.deb && \ 14 | rm /tmp/git-lfs.deb 15 | RUN echo 'source /usr/share/bash-completion/completions/git' >> ~/.bashrc 16 | CMD ["/bin/bash"] 17 | 18 | This works seamlessly in combination with the VSCode DevContainer extension: 19 | 20 | .. code-block:: json 21 | 22 | { 23 | "name": "Dev Container", 24 | "dockerFile": "Dockerfile", 25 | "runArgs": [ 26 | "--network", 27 | "host", 28 | "--gpus", 29 | "all" 30 | ], 31 | "customizations": { 32 | "vscode": { 33 | "settings": { 34 | "terminal.integrated.shell.linux": "/bin/bash" 35 | }, 36 | "extensions": [ 37 | "ms-python.python" 38 | ] 39 | } 40 | } 41 | } 42 | 43 | In VSCode, add this to your :file:`launch.json`: 44 | 45 | .. code-block:: json 46 | 47 | { 48 | "name": "Torchrun Train and Eval", 49 | "type": "python", 50 | "request": "launch", 51 | "module": "torch.distributed.run", 52 | "env": { 53 | "CUDA_VISIBLE_DEVICES": "4,5" 54 | }, 55 | "args": [ 56 | "--nnodes", 57 | "1", 58 | "--nproc_per_node", 59 | "2", 60 | "--rdzv-endpoint=0.0.0.0:29503", 61 | "src/modalities/__main__.py", 62 | "run", 63 | "--config_file_path", 64 | "config_files/config_lorem_ipsum.yaml" 65 | ], 66 | "console": "integratedTerminal", 67 | "justMyCode": true, 68 | "envFile": "${workspaceFolder}/.env", 69 | "cwd": "${workspaceFolder}/modalities" 70 | } 71 | 72 | -------------------------------------------------------------------------------- /notebooks/components.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | referencing_keys: 3 | sample_key: input_ids 4 | training: 5 | local_train_micro_batch_size: 8 6 | sequence_length: 2048 7 | 8 | tokenizer: 9 | component_key: tokenizer 10 | variant_key: pretrained_sp_tokenizer 11 | config: 12 | tokenizer_model_file: /workspaces/modalities/notebooks/tokenizer/unigram_tokenizer.model 13 | padding: false 14 | truncation: false 15 | 16 | train_dataset: 17 | component_key: dataset 18 | variant_key: packed_mem_map_dataset_continuous 19 | config: 20 | raw_data_path: /workspaces/modalities/notebooks/tokenizer/redpyjama_v2_default_DE_num_docs_65536.pbin 21 | block_size: ${settings.training.sequence_length} 22 | sample_key: ${settings.referencing_keys.sample_key} 23 | 24 | val_dataset: 25 | component_key: dataset 26 | variant_key: packed_mem_map_dataset_continuous 27 | config: 28 | raw_data_path: /workspaces/modalities/notebooks/tokenizer/redpyjama_v2_default_DE_num_docs_65536.pbin 29 | block_size: ${settings.training.sequence_length} 30 | sample_key: ${settings.referencing_keys.sample_key} -------------------------------------------------------------------------------- /notebooks/tokenizer/redpajama_v2_samples_512_test.idx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/notebooks/tokenizer/redpajama_v2_samples_512_test.idx -------------------------------------------------------------------------------- /notebooks/tokenizer/redpajama_v2_samples_512_test.pbin: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/notebooks/tokenizer/redpajama_v2_samples_512_test.pbin -------------------------------------------------------------------------------- /notebooks/tokenizer/unigram_tokenizer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/notebooks/tokenizer/unigram_tokenizer.model -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "modalities" 3 | version = "0.3.2" 4 | requires-python = ">=3.10,<3.12" 5 | description = "Modalities, a PyTorch-native framework for distributed and reproducible foundation model training." 6 | readme = "README.md" 7 | dependencies = [ 8 | "numpy<2.0", 9 | "torch==2.6.0", 10 | "packaging", 11 | "tqdm", 12 | "pyyaml", 13 | "transformers", 14 | "datasets", 15 | "protobuf", 16 | "SentencePiece", 17 | "rich", 18 | "omegaconf", 19 | "pydantic", 20 | "click", 21 | "click_pathlib", 22 | "jq", 23 | "class_resolver", 24 | "wandb", 25 | "einops>=0.7.0", 26 | ] 27 | 28 | [project.urls] 29 | Homepage = "https://github.com/Modalities/modalities" 30 | Issues = "https://github.com/Modalities/modalities/issues" 31 | 32 | [project.optional-dependencies] 33 | linting = ["pre-commit"] 34 | tests = ["pytest", "pytest-cov", "debugpy"] 35 | install_helper = ["ninja"] 36 | 37 | [project.scripts] 38 | modalities = "modalities.__main__:main" 39 | 40 | [build-system] 41 | requires = ["setuptools >= 61.0.0"] 42 | build-backend = "setuptools.build_meta" 43 | 44 | [tool.black] 45 | target-version = ["py310"] 46 | line-length = 120 47 | 48 | [tool.isort] 49 | profile = "black" 50 | line_length = 120 51 | src_paths = ["src", "tests"] 52 | 53 | [tool.ruff] 54 | line-length = 120 55 | 56 | [tool.pytest.ini_options] 57 | addopts = "--cov=src --cov-report term --cov-report html" 58 | 59 | [tool.coverage.run] 60 | branch = true 61 | omit = ["*/src/modalities/dataloader/open_gptx_dataset/*"] 62 | 63 | [tool.coverage.report] 64 | # Regexes for lines to exclude from consideration 65 | exclude_also = [ 66 | # Don't complain about missing debug-only code: 67 | "def __repr__", 68 | "if self\\.debug", 69 | 70 | # Don't complain if tests don't hit defensive assertion code: 71 | "raise AssertionError", 72 | "raise NotImplementedError", 73 | 74 | # Don't complain if non-runnable code isn't run: 75 | "if 0:", 76 | "if __name__ == .__main__.:", 77 | 78 | # Don't complain about abstract methods, they aren't run: 79 | "@(abc\\.)?abstractmethod", 80 | ] 81 | 82 | 83 | ignore_errors = true 84 | 85 | [tool.coverage.html] 86 | directory = "coverage_html_report" -------------------------------------------------------------------------------- /scripts/convco_for_reverts.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | commit_msg_file="$1" 4 | default_revert_msg=$(cat "$commit_msg_file") 5 | convco_aligned_revert_msg=$(echo "$default_revert_msg" | sed '1s/^Revert /revert: /') 6 | echo "$convco_aligned_revert_msg" > "$commit_msg_file" 7 | -------------------------------------------------------------------------------- /scripts/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 torchrun --rdzv-endpoint 
localhost:29504 --nnodes 1 --nproc_per_node 6 $(which modalities) run --config_file_path ../config_files/config_example_mem_map_dataset.yaml -------------------------------------------------------------------------------- /src/modalities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/__init__.py -------------------------------------------------------------------------------- /src/modalities/checkpointing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/checkpointing/__init__.py -------------------------------------------------------------------------------- /src/modalities/checkpointing/checkpoint_conversion.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from modalities.config.config import load_app_config_dict 4 | from modalities.models.huggingface_adapters.hf_adapter import HFModelAdapter, HFModelAdapterConfig 5 | 6 | 7 | class CheckpointConversion: 8 | """Class to convert a PyTorch checkpoint to a Hugging Face checkpoint.""" 9 | 10 | def __init__( 11 | self, 12 | config_file_path: Path, 13 | output_hf_checkpoint_dir: Path, 14 | ): 15 | """ 16 | Initializes the CheckpointConversion object. 17 | 18 | Args: 19 | config_file_path (Path): The path to the configuration file containing the pytorch model configuration. 20 | output_hf_checkpoint_dir (Path): The path to the output Hugging Face checkpoint directory. 21 | 22 | Raises: 23 | ValueError: If the config_file_path does not exist. 24 | 25 | """ 26 | self.output_hf_checkpoint_dir = output_hf_checkpoint_dir 27 | if not config_file_path.exists(): 28 | raise ValueError(f"Could not find {config_file_path}.") 29 | 30 | self.config_dict = load_app_config_dict(config_file_path) 31 | 32 | def convert_pytorch_to_hf_checkpoint(self, prediction_key: str) -> HFModelAdapter: 33 | """ 34 | Converts a PyTorch checkpoint to a Hugging Face checkpoint. 35 | 36 | Args: 37 | prediction_key (str): The prediction key to be used in the HFModelAdapter. 38 | 39 | Returns: 40 | HFModelAdapter: The converted Hugging Face model adapter. 41 | 42 | """ 43 | config = HFModelAdapterConfig(config=self.config_dict) 44 | hf_model = HFModelAdapter(config=config, prediction_key=prediction_key, load_checkpoint=True) 45 | hf_model.save_pretrained(self.output_hf_checkpoint_dir, safe_serialization=False) 46 | return hf_model 47 | -------------------------------------------------------------------------------- /src/modalities/checkpointing/checkpoint_loading.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from pathlib import Path 3 | 4 | import torch.nn as nn 5 | from torch.optim import Optimizer 6 | 7 | from modalities.checkpointing.stateful.app_state import AppState 8 | 9 | 10 | class DistributedCheckpointLoadingIF(ABC): 11 | """Distributed checkpoint loading interface for loading PyTorch models and optimizer checkpoints.""" 12 | 13 | @abstractmethod 14 | def load_checkpoint_(self, app_state: AppState, checkpoint_dir_path: Path) -> AppState: 15 | """Loads the distributed checkpoint from the specified directory path into the AppState. 
16 | 17 | Args: 18 | app_state (AppState): The application state with the model, optimizer and lr scheduler. 19 | checkpoint_dir_path (Path): The directory path to the distributed checkpoint. 20 | 21 | Raises: 22 | NotImplementedError: This abstract method is not implemented and should be overridden in a subclass. 23 | 24 | Returns: 25 | AppState: The application state with the loaded checkpoint. 26 | """ 27 | raise NotImplementedError 28 | 29 | 30 | class FSDP1CheckpointLoadingIF(ABC): 31 | """Checkpoint loading interface for loading PyTorch models and optimizer checkpoints.""" 32 | 33 | @abstractmethod 34 | def load_model_checkpoint(self, model: nn.Module, file_path: Path) -> nn.Module: 35 | """ 36 | Loads a model checkpoint from the specified file path. 37 | 38 | Args: 39 | model (nn.Module): The model to load the checkpoint into. 40 | file_path (Path): The path to the checkpoint file. 41 | 42 | Returns: 43 | nn.Module: The loaded model with the checkpoint parameters. 44 | 45 | Raises: 46 | NotImplementedError: This abstract method is not implemented and should be overridden in a subclass. 47 | """ 48 | raise NotImplementedError 49 | 50 | @abstractmethod 51 | def load_optimizer_checkpoint_( 52 | self, 53 | optimizer: Optimizer, 54 | model: nn.Module, 55 | file_path: Path, 56 | ): 57 | """ 58 | Loads an optimizer checkpoint from the specified file path (in-place). 59 | 60 | Args: 61 | optimizer (Optimizer): The optimizer to load the checkpoint into (in-place). 62 | model (nn.Module): The model associated with the optimizer. 63 | file_path (Path): The path to the checkpoint file. 64 | 65 | Raises: 66 | NotImplementedError: This abstract method is not implemented and should be overridden in a subclass. 67 | """ 68 | raise NotImplementedError 69 | -------------------------------------------------------------------------------- /src/modalities/checkpointing/checkpoint_saving.py: -------------------------------------------------------------------------------- 1 | from modalities.batch import EvaluationResultBatch 2 | from modalities.checkpointing.checkpoint_saving_execution import CheckpointSavingExecutionABC 3 | from modalities.checkpointing.checkpoint_saving_strategies import CheckpointSavingStrategyIF 4 | from modalities.checkpointing.stateful.app_state import AppState 5 | from modalities.training.training_progress import TrainingProgress 6 | 7 | 8 | class CheckpointSaving: 9 | """Class for saving checkpoints based on a saving strategy and an execution strategy.""" 10 | 11 | def __init__( 12 | self, 13 | checkpoint_saving_strategy: CheckpointSavingStrategyIF, 14 | checkpoint_saving_execution: CheckpointSavingExecutionABC, 15 | ): 16 | """ 17 | Initializes the CheckpointSaving object. 18 | 19 | Args: 20 | checkpoint_saving_strategy (CheckpointSavingStrategyIF): The strategy for saving checkpoints. 21 | checkpoint_saving_execution (CheckpointSavingExecutionABC): The execution for saving checkpoints. 22 | """ 23 | self.checkpoint_saving_strategy = checkpoint_saving_strategy 24 | self.checkpoint_saving_execution = checkpoint_saving_execution 25 | 26 | def save_checkpoint( 27 | self, 28 | training_progress: TrainingProgress, 29 | evaluation_result: dict[str, EvaluationResultBatch], 30 | app_state: AppState, 31 | early_stoppping_criterion_fulfilled: bool = False, 32 | ): 33 | """ 34 | Saves a checkpoint of the model and optimizer. 35 | 36 | Args: 37 | training_progress (TrainingProgress): The training progress. 38 | evaluation_result (dict[str, EvaluationResultBatch]): The evaluation result.
39 | app_state (AppState): The application state to be checkpointed. 40 | early_stoppping_criterion_fulfilled (bool, optional): 41 | Whether the early stopping criterion is fulfilled. Defaults to False. 42 | """ 43 | checkpointing_instruction = self.checkpoint_saving_strategy.get_checkpoint_instruction( 44 | training_progress=training_progress, 45 | evaluation_result=evaluation_result, 46 | early_stoppping_criterion_fulfilled=early_stoppping_criterion_fulfilled, 47 | ) 48 | 49 | self.checkpoint_saving_execution.run_checkpoint_instruction( 50 | checkpointing_instruction=checkpointing_instruction, 51 | training_progress=training_progress, 52 | app_state=app_state, 53 | ) 54 | -------------------------------------------------------------------------------- /src/modalities/checkpointing/checkpoint_saving_execution.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | from modalities.checkpointing.checkpoint_saving_instruction import CheckpointingInstruction 4 | from modalities.checkpointing.stateful.app_state import AppState 5 | from modalities.training.training_progress import TrainingProgress 6 | 7 | 8 | class CheckpointSavingExecutionABC(ABC): 9 | """Abstract class for saving PyTorch model and optimizer checkpoints.""" 10 | 11 | @abstractmethod 12 | def _save_checkpoint(self, app_state: AppState, training_progress: TrainingProgress): 13 | """ 14 | Saves the checkpoint of the model and optimizer. 15 | 16 | Args: 17 | app_state (AppState): The application state to be checkpointed. 18 | training_progress (TrainingProgress): The training progress. 19 | 20 | Raises: 21 | NotImplementedError: This method is not implemented and should be overridden in a subclass. 22 | """ 23 | raise NotImplementedError 24 | 25 | @abstractmethod 26 | def _delete_checkpoint(self, training_progress: TrainingProgress): 27 | """ 28 | Deletes the checkpoint based on the training progress. 29 | 30 | Args: 31 | training_progress (TrainingProgress): The training progress. 32 | 33 | Raises: 34 | NotImplementedError: This abstract method is not implemented and should be overridden in a subclass. 35 | """ 36 | raise NotImplementedError 37 | 38 | def run_checkpoint_instruction( 39 | self, 40 | checkpointing_instruction: CheckpointingInstruction, 41 | training_progress: TrainingProgress, 42 | app_state: AppState, 43 | ): 44 | """ 45 | Runs the checkpoint instruction. 46 | 47 | Args: 48 | checkpointing_instruction (CheckpointingInstruction): The checkpointing instruction. 49 | training_progress (TrainingProgress): The training progress. 50 | app_state (AppState): The application state to be checkpointed. 51 | """ 52 | if checkpointing_instruction.save_current: 53 | self._save_checkpoint(app_state=app_state, training_progress=training_progress) 54 | 55 | for training_progress_to_delete in checkpointing_instruction.checkpoints_to_delete: 56 | self._delete_checkpoint(training_progress=training_progress_to_delete) 57 | -------------------------------------------------------------------------------- /src/modalities/checkpointing/checkpoint_saving_instruction.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from modalities.training.training_progress import TrainingProgress 4 | 5 | 6 | @dataclass 7 | class CheckpointingInstruction: 8 | """ 9 | Represents a checkpointing instruction (i.e., saving and deleting). 
10 | 11 | Attributes: 12 | save_current (bool): Indicates whether to save the current checkpoint. 13 | checkpoints_to_delete (list[TrainingProgress]): List of checkpoint IDs to delete. 14 | """ 15 | 16 | save_current: bool = False 17 | checkpoints_to_delete: list[TrainingProgress] = field(default_factory=list) 18 | -------------------------------------------------------------------------------- /src/modalities/checkpointing/fsdp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/checkpointing/fsdp/__init__.py -------------------------------------------------------------------------------- /src/modalities/checkpointing/stateful/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/checkpointing/stateful/__init__.py -------------------------------------------------------------------------------- /src/modalities/checkpointing/stateful/app_state_factory.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Optional 3 | 4 | import torch.distributed as dist 5 | import torch.nn as nn 6 | from torch.optim import Optimizer 7 | from torch.optim.lr_scheduler import LRScheduler 8 | 9 | from modalities.checkpointing.fsdp.fsdp_checkpoint_loading import DCPCheckpointLoading 10 | from modalities.checkpointing.stateful.app_state import AppState 11 | 12 | 13 | class AppStateFactory: 14 | """Factory class to create AppState objects.""" 15 | 16 | @staticmethod 17 | def get_raw_app_state( 18 | model: nn.Module, optimizer: Optimizer, lr_scheduler: Optional[LRScheduler] = None 19 | ) -> AppState: 20 | """Creates a new (non-checkpoint loaded) AppState object from an instantiated 21 | model, optimizer, and optional learning rate scheduler. 22 | 23 | Args: 24 | model (nn.Module): The model can be either a non-sharded model, FSDP1 or FSDP2 model. 25 | optimizer (Optimizer): The optimizer can be either a non-sharded optimizer, FSDP1 or FSDP2 optimizer. 26 | lr_scheduler (Optional[LRScheduler], optional): Lr scheduler used during training. Defaults to None. 27 | 28 | Returns: 29 | AppState: The AppState object. 30 | """ 31 | app_state = AppState(model=model, optimizer=optimizer, lr_scheduler=lr_scheduler) 32 | return app_state 33 | 34 | @staticmethod 35 | def get_dcp_checkpointed_app_state_( 36 | raw_app_state: AppState, 37 | checkpoint_dir_path: Path, 38 | ) -> AppState: 39 | """Loads the checkpointed state dict into the raw AppState object 40 | (i.e., non-checkpoint loaded AppState) in-place. 41 | 42 | Args: 43 | raw_app_state (AppState): The raw AppState object. 44 | checkpoint_dir_path (Path): The path to the checkpoint directory. 45 | 46 | Raises: 47 | RuntimeError: Raises an error if the state dict has already been loaded. 48 | 49 | Returns: 50 | AppState: The AppState object with the loaded state dict. 51 | """ 52 | if raw_app_state.is_loaded: 53 | raise RuntimeError( 54 | "Cannot call load_state_dict twice on the same AppState object. " "State dict has already been loaded." 
55 | ) 56 | cp_loading = DCPCheckpointLoading(global_rank=dist.get_rank()) 57 | cp_loading.load_checkpoint_(app_state=raw_app_state, checkpoint_dir_path=checkpoint_dir_path) 58 | return raw_app_state 59 | -------------------------------------------------------------------------------- /src/modalities/checkpointing/torch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/checkpointing/torch/__init__.py -------------------------------------------------------------------------------- /src/modalities/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/config/__init__.py -------------------------------------------------------------------------------- /src/modalities/config/lookup_enum.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class LookupEnum(Enum): 5 | @classmethod 6 | def _missing_(cls, value: str) -> type: 7 | """Constructs the Enum by member name if it is not constructable by value.""" 8 | return cls.__dict__[value] 9 | -------------------------------------------------------------------------------- /src/modalities/config/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import torch 4 | from pydantic import BaseModel 5 | 6 | 7 | def convert_base_model_config_to_dict(config: BaseModel) -> dict[Any, Any]: 8 | """Converts a Pydantic BaseModel to a dictionary (non-recursively).""" 9 | return {key: getattr(config, key) for key in config.model_dump().keys()} 10 | 11 | 12 | def parse_torch_device(device: str | int) -> torch.device: 13 | if isinstance(device, str) and device != "cpu": 14 | raise ValueError(f"Invalid device_id: {device}") 15 | elif isinstance(device, int): 16 | device_id = f"cuda:{device}" 17 | else: 18 | device_id = "cpu" 19 | device = torch.device(device_id) 20 | return device 21 | -------------------------------------------------------------------------------- /src/modalities/conversion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/conversion/__init__.py -------------------------------------------------------------------------------- /src/modalities/conversion/gpt2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/conversion/gpt2/__init__.py -------------------------------------------------------------------------------- /src/modalities/conversion/gpt2/conversion_code.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | 5 | def _copy_model_files(output_dir: str): 6 | source_dir = os.path.dirname(__file__) 7 | modeling_gpt2_path = os.path.join(source_dir, "modeling_gpt2.py") 8 | configuration_gpt2_path = os.path.join(source_dir, "configuration_gpt2.py") 9 | shutil.copy(modeling_gpt2_path, output_dir) 10 | shutil.copy(configuration_gpt2_path, output_dir) 11 | 12 | 13 | def _change_modalities_import_to_relative_import(output_dir: str): 14 |
target_modeling_file = os.path.join(output_dir, "modeling_gpt2.py") 15 | with open(target_modeling_file, "r") as file: 16 | content = file.read() 17 | content = content.replace("modalities.conversion.gpt2.configuration_gpt2", ".configuration_gpt2") 18 | with open(target_modeling_file, "w") as file: 19 | file.write(content) 20 | 21 | 22 | def transfer_model_code(output_dir: str): 23 | """Copies the required model code to the output directory and replaces modalities imports. 24 | This allows the converted model to be used without the modalities package via: 25 | >>> from transformers import AutoModelForCausalLM 26 | >>> model = AutoModelForCausalLM.from_pretrained("path/to/converted/model", trust_remote_code=True) 27 | 28 | Args: 29 | output_dir (str): Directory of the converted model. 30 | """ 31 | _copy_model_files(output_dir) 32 | _change_modalities_import_to_relative_import(output_dir) 33 | -------------------------------------------------------------------------------- /src/modalities/dataloader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/dataloader/__init__.py -------------------------------------------------------------------------------- /src/modalities/dataloader/dataloader_factory.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | from torch.utils.data import BatchSampler 4 | from torch.utils.data.dataset import Dataset 5 | 6 | from modalities.dataloader.dataloader import LLMDataLoader 7 | 8 | 9 | class DataloaderFactory: 10 | @staticmethod 11 | def get_dataloader( 12 | dataloader_tag: str, 13 | dataset: Dataset, 14 | batch_sampler: BatchSampler, 15 | collate_fn: Callable, 16 | num_workers: int, 17 | pin_memory: bool, 18 | ) -> LLMDataLoader: 19 | """ 20 | Factory method for the instantiation of LLMDataLoader. 
21 | 22 | Args: 23 | dataloader_tag (str): Tag for the dataloader 24 | dataset (Dataset): Dataset to be used 25 | batch_sampler (BatchSampler): batch sampler for batch-wise sampling from the dataset 26 | collate_fn (Callable): Callable for shaping the batch 27 | num_workers (int): Number of workers for the dataloader 28 | pin_memory (bool): Flag indicating whether to pin memory 29 | Returns: 30 | LLMDataLoader: Instance of LLMDataLoader 31 | """ 32 | dataloader = LLMDataLoader( 33 | dataloader_tag=dataloader_tag, 34 | batch_sampler=batch_sampler, 35 | dataset=dataset, 36 | collate_fn=collate_fn, 37 | num_workers=num_workers, 38 | pin_memory=pin_memory, 39 | ) 40 | 41 | return dataloader 42 | -------------------------------------------------------------------------------- /src/modalities/dataloader/preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/dataloader/preprocessing/__init__.py -------------------------------------------------------------------------------- /src/modalities/exceptions.py: -------------------------------------------------------------------------------- 1 | class DatasetNotFoundError(Exception): 2 | pass 3 | 4 | 5 | class BatchStateError(Exception): 6 | pass 7 | 8 | 9 | class CheckpointingError(Exception): 10 | pass 11 | 12 | 13 | class RunningEnvError(Exception): 14 | pass 15 | 16 | 17 | class TimeRecorderStateError(Exception): 18 | pass 19 | 20 | 21 | class OptimizerError(Exception): 22 | pass 23 | 24 | 25 | class ConfigError(Exception): 26 | pass 27 | 28 | 29 | class ModelStateError(Exception): 30 | pass 31 | -------------------------------------------------------------------------------- /src/modalities/inference/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/inference/__init__.py -------------------------------------------------------------------------------- /src/modalities/inference/inference.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from typing import Optional 4 | 5 | from pydantic import FilePath 6 | 7 | from modalities.config.component_factory import ComponentFactory 8 | from modalities.config.config import ProcessGroupBackendType, load_app_config_dict 9 | from modalities.config.instantiation_models import TextGenerationInstantiationModel 10 | from modalities.inference.text.config import TextInferenceComponentConfig 11 | from modalities.inference.text.inference_component import TextInferenceComponent 12 | from modalities.registry.components import COMPONENTS 13 | from modalities.registry.registry import Registry 14 | from modalities.running_env.cuda_env import CudaEnv 15 | from modalities.running_env.env_utils import is_running_with_torchrun 16 | 17 | 18 | def generate_text(config_path: FilePath, registry: Optional[Registry] = None): 19 | config_dict = load_app_config_dict(config_path) 20 | if registry is None: 21 | registry = Registry(COMPONENTS) 22 | registry.add_entity( 23 | component_key="inference_component", 24 | variant_key="text", 25 | component_type=TextInferenceComponent, 26 | component_config_type=TextInferenceComponentConfig, 27 | ) 28 | component_factory = ComponentFactory(registry=registry) 29 | 30 | if is_running_with_torchrun(): 31 | with 
CudaEnv(process_group_backend=ProcessGroupBackendType.nccl): 32 | components = component_factory.build_components( 33 | config_dict=config_dict, 34 | components_model_type=TextGenerationInstantiationModel, 35 | ) 36 | 37 | else: 38 | components = component_factory.build_components( 39 | config_dict=config_dict, 40 | components_model_type=TextGenerationInstantiationModel, 41 | ) 42 | text_inference_component = components.text_inference_component 43 | 44 | text_inference_component.run() 45 | -------------------------------------------------------------------------------- /src/modalities/inference/text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/inference/text/__init__.py -------------------------------------------------------------------------------- /src/modalities/inference/text/config.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from pydantic import BaseModel, field_validator 4 | 5 | from modalities.config.pydantic_if_types import ( 6 | PydanticPytorchDeviceType, 7 | PydanticPytorchModuleType, 8 | PydanticTokenizerIFType, 9 | ) 10 | from modalities.config.utils import parse_torch_device 11 | 12 | 13 | class TextInferenceComponentConfig(BaseModel): 14 | model: PydanticPytorchModuleType 15 | tokenizer: PydanticTokenizerIFType 16 | prompt_template: str 17 | sequence_length: int 18 | temperature: Optional[float] = 1.0 19 | eod_token: Optional[str] = "" 20 | device: PydanticPytorchDeviceType 21 | 22 | @field_validator("device", mode="before") 23 | def parse_device(cls, device) -> PydanticPytorchDeviceType: 24 | return parse_torch_device(device) 25 | -------------------------------------------------------------------------------- /src/modalities/logging_broker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/logging_broker/__init__.py -------------------------------------------------------------------------------- /src/modalities/logging_broker/message_broker.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from collections import defaultdict 3 | 4 | from modalities.logging_broker.messages import Message, MessageTypes 5 | from modalities.logging_broker.subscriber import MessageSubscriberIF 6 | 7 | 8 | class MessageBrokerIF(ABC): 9 | """Interface for message broker objects.""" 10 | 11 | @abstractmethod 12 | def add_subscriber(self, subscription: MessageTypes, subscriber: MessageSubscriberIF): 13 | raise NotImplementedError 14 | 15 | @abstractmethod 16 | def distribute_message(self, message: Message): 17 | raise NotImplementedError 18 | 19 | 20 | class MessageBroker(MessageBrokerIF): 21 | """The MessageBroker sends notifications to its subscribers.""" 22 | 23 | def __init__(self) -> None: 24 | self.subscriptions: dict[MessageTypes, list[MessageSubscriberIF]] = defaultdict(list) 25 | 26 | def add_subscriber(self, subscription: MessageTypes, subscriber: MessageSubscriberIF): 27 | """Adds a single subscriber.""" 28 | self.subscriptions[subscription].append(subscriber) 29 | 30 | def distribute_message(self, message: Message): 31 | """Distributes message to all subscribers.""" 32 | message_type = message.message_type 33 | for 
subscriber in self.subscriptions[message_type]: 34 | subscriber.consume_message(message=message) 35 | -------------------------------------------------------------------------------- /src/modalities/logging_broker/messages.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from enum import Enum 3 | from typing import Generic, TypeVar 4 | 5 | 6 | class MessageTypes(Enum): 7 | HIGH_LEVEL_PROGRESS_UPDATE = "HIGH_LEVEL_PROGRESS_UPDATE" 8 | BATCH_PROGRESS_UPDATE = "PROGRESS_UPDATE" 9 | ERROR_MESSAGE = "ERROR_MESSAGE" 10 | EVALUATION_RESULT = "EVALUATION_RESULT" 11 | 12 | 13 | T = TypeVar("T") 14 | 15 | 16 | @dataclass 17 | class Message(Generic[T]): 18 | """An object representing a message.""" 19 | 20 | message_type: MessageTypes 21 | payload: T 22 | global_rank: int = 0 23 | local_rank: int = 0 24 | 25 | 26 | class ExperimentStatus(Enum): 27 | TRAIN = "TRAIN" 28 | EVALUATION = "EVALUATION" 29 | 30 | 31 | @dataclass 32 | class ProgressUpdate: 33 | """Object holding the state of the current batch / step computation progress.""" 34 | 35 | num_steps_done: int 36 | # Note: in case of ExperimentStatus.TRAIN, dataset_batch_id=global_train_batch_id 37 | experiment_status: ExperimentStatus 38 | dataloader_tag: str 39 | -------------------------------------------------------------------------------- /src/modalities/logging_broker/publisher.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Generic, TypeVar 3 | 4 | from modalities.logging_broker.message_broker import Message, MessageBroker 5 | from modalities.logging_broker.messages import MessageTypes 6 | 7 | T = TypeVar("T") 8 | 9 | 10 | class MessagePublisherIF(ABC, Generic[T]): 11 | @abstractmethod 12 | def publish_message(self, payload: T, message_type: MessageTypes): 13 | raise NotImplementedError 14 | 15 | 16 | class MessagePublisher(MessagePublisherIF[T]): 17 | """The MessagePublisher sends messages through a message broker.""" 18 | 19 | def __init__( 20 | self, 21 | message_broker: MessageBroker, 22 | global_rank: int, 23 | local_rank: int, 24 | ): 25 | self.message_broker = message_broker 26 | self.global_rank = global_rank 27 | self.local_rank = local_rank 28 | 29 | def publish_message(self, payload: T, message_type: MessageTypes): 30 | """Publish a message through the message broker.""" 31 | message = Message[T]( 32 | message_type=message_type, 33 | global_rank=self.global_rank, 34 | local_rank=self.local_rank, 35 | payload=payload, 36 | ) 37 | self.message_broker.distribute_message(message) 38 | -------------------------------------------------------------------------------- /src/modalities/logging_broker/subscriber.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Generic, TypeVar 3 | 4 | from modalities.logging_broker.messages import Message 5 | 6 | T = TypeVar("T") 7 | 8 | 9 | class MessageSubscriberIF(ABC, Generic[T]): 10 | """Interface for message subscribers.""" 11 | 12 | @abstractmethod 13 | def consume_message(self, message: Message[T]): 14 | raise NotImplementedError 15 | 16 | @abstractmethod 17 | def consume_dict(self, message_dict: dict[str, Any]): 18 | raise NotImplementedError 19 | -------------------------------------------------------------------------------- /src/modalities/logging_broker/subscriber_impl/__init__.py:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/logging_broker/subscriber_impl/__init__.py -------------------------------------------------------------------------------- /src/modalities/models/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/modalities/models/coca/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/models/coca/__init__.py -------------------------------------------------------------------------------- /src/modalities/models/coca/attention_pooling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from modalities.nn.attention import AttentionConfig, AttentionType, MultiHeadAttention 5 | 6 | 7 | class AttentionPooling(nn.Module): 8 | """Attention pooling class.""" 9 | 10 | def __init__(self, n_embd: int, n_head: int, bias: bool, epsilon: float, attention_config: AttentionConfig = None): 11 | """ 12 | Initializes an instance of the AttentionPooling class. 13 | 14 | Args: 15 | n_embd (int): The size of the embeddings. 16 | 17 | n_head (int): The number of attention heads. 18 | bias (bool): Flag indicating whether to include bias in the layer normalization. 19 | epsilon (float): A small value to avoid division by zero in layer normalization. 20 | attention_config (AttentionConfig, optional): The configuration for attention mechanism. Defaults to None. 21 | 22 | Returns: 23 | None 24 | """ 25 | super().__init__() 26 | self.ln_1 = nn.LayerNorm(normalized_shape=n_embd, bias=bias, eps=epsilon) 27 | self.attn = MultiHeadAttention( 28 | n_embd=n_embd, 29 | n_head=n_head, 30 | attention_config=attention_config, 31 | attention_type=AttentionType.CROSS_ATTENTION, 32 | ) 33 | self.ln_2 = nn.LayerNorm(normalized_shape=n_embd, bias=bias, eps=epsilon) 34 | 35 | def forward(self, queries: torch.Tensor, context: torch.Tensor) -> torch.Tensor: 36 | """ 37 | Forward pass of the attention pooling module. 38 | 39 | Args: 40 | queries (torch.Tensor): The input queries tensor. 41 | context (torch.Tensor): The input context tensor. 42 | 43 | Returns: 44 | torch.Tensor: The output tensor. 
45 | """ 46 | x = self.ln_1(context) 47 | x = self.attn(queries, context=x) 48 | x = self.ln_2(x) 49 | return x 50 | -------------------------------------------------------------------------------- /src/modalities/models/components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/models/components/__init__.py -------------------------------------------------------------------------------- /src/modalities/models/gpt2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/models/gpt2/__init__.py -------------------------------------------------------------------------------- /src/modalities/models/gpt2/collator.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import torch 4 | 5 | from modalities.batch import DatasetBatch 6 | 7 | 8 | class CollateFnIF(ABC): 9 | """CollateFnIF class to define a collate function interface.""" 10 | 11 | @abstractmethod 12 | def __call__(self, batch: list[dict[str, torch.Tensor]]) -> DatasetBatch: 13 | """ 14 | Process a batch of data. 15 | 16 | Args: 17 | batch (list[dict[str, torch.Tensor]]): A list of dictionaries containing tensors. 18 | 19 | Returns: 20 | DatasetBatch: The processed batch of data. 21 | 22 | Raises: 23 | NotImplementedError: This abstract method should be implemented in a subclass. 24 | """ 25 | raise NotImplementedError 26 | 27 | 28 | class GPT2LLMCollateFn(CollateFnIF): 29 | """GPT2LLMCollateFn class to define a collate function for GPT2 language model.""" 30 | 31 | def __init__(self, sample_key: str, target_key: str): 32 | """ 33 | Initializes the Collator object. 34 | 35 | Args: 36 | sample_key (str): The key for accessing the sample data. 37 | target_key (str): The key for accessing the target data. 38 | """ 39 | self.sample_key = sample_key 40 | self.target_key = target_key 41 | 42 | def __call__(self, batch: list[dict[str, torch.Tensor]]) -> DatasetBatch: 43 | """ 44 | Process a batch of data. 45 | 46 | Args: 47 | batch (list[dict[str, torch.Tensor]]): A list of dictionaries containing tensors. 48 | 49 | Returns: 50 | DatasetBatch: A processed batch of data where sample and target sequences are created. 51 | 52 | """ 53 | 54 | sample_tensor = torch.stack([torch.tensor(d[self.sample_key]) for d in batch]) 55 | samples = {self.sample_key: sample_tensor[:, :-1]} 56 | targets = {self.target_key: sample_tensor[:, 1:]} 57 | 58 | return DatasetBatch(targets=targets, samples=samples) 59 | -------------------------------------------------------------------------------- /src/modalities/models/gpt2/pretrained_gpt_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import PreTrainedModel 3 | 4 | from modalities.config.config import PretrainedGPTConfig 5 | from modalities.models.gpt2.gpt2_model import GPT2LLM 6 | 7 | 8 | class PretrainedGPTModel(PreTrainedModel): 9 | """Pretrained GPT model class.""" 10 | 11 | config_class = PretrainedGPTConfig 12 | 13 | def __init__(self, config: PretrainedGPTConfig): 14 | """ 15 | Initializes a PretrainedGPTModel object. 16 | 17 | Args: 18 | config (PretrainedGPTConfig): The configuration object for the model. 
19 | 20 | Returns: 21 | None 22 | """ 23 | super().__init__(config) 24 | # TODO offloading the parameters like this is ugly 25 | self.model: GPT2LLM = GPT2LLM(**dict(config.config)) 26 | 27 | def forward(self, tensor): 28 | """ 29 | Forward pass of the pretrained GPT model. 30 | 31 | Args: 32 | tensor (torch.Tensor): The input tensor. 33 | 34 | Returns: 35 | torch.Tensor: The output tensor. 36 | 37 | """ 38 | model_input = {"input_ids": tensor} 39 | model_forward_output: dict[str, torch.Tensor] = self.model(model_input) 40 | return model_forward_output[self.config.config.prediction_key] 41 | 42 | 43 | if __name__ == "__main__": 44 | ... 45 | -------------------------------------------------------------------------------- /src/modalities/models/huggingface/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/modalities/models/huggingface_adapters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/models/huggingface_adapters/__init__.py -------------------------------------------------------------------------------- /src/modalities/models/utils.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | from pydantic import BaseModel 4 | 5 | from modalities.config.component_factory import ComponentFactory 6 | from modalities.config.pydantic_if_types import PydanticPytorchModuleType 7 | from modalities.registry.components import COMPONENTS 8 | from modalities.registry.registry import Registry 9 | 10 | 11 | class ModelTypeEnum(Enum): 12 | """ 13 | Enumeration class representing different types of models. 14 | 15 | Attributes: 16 | MODEL (str): Represents a regular model. 17 | CHECKPOINTED_MODEL (str): Represents a checkpointed model. 18 | """ 19 | 20 | MODEL = "model" 21 | CHECKPOINTED_MODEL = "checkpointed_model" 22 | 23 | 24 | def get_model_from_config(config: dict, model_type: ModelTypeEnum): 25 | """ 26 | Retrieves a model from the given configuration based on the specified model type. 27 | 28 | Args: 29 | config (dict): The configuration dictionary. 30 | model_type (ModelTypeEnum): The type of the model to retrieve. 31 | 32 | Returns: 33 | Any: The model object based on the specified model type. 34 | 35 | Raises: 36 | NotImplementedError: If the model type is not supported. 
37 | """ 38 | registry = Registry(COMPONENTS) 39 | component_factory = ComponentFactory(registry=registry) 40 | 41 | # create the pydantic config for the component factory dynamically based on model_type 42 | if model_type.value == "model": 43 | 44 | class PydanticConfig(BaseModel): 45 | model: PydanticPytorchModuleType 46 | 47 | elif model_type.value == "checkpointed_model": 48 | 49 | class PydanticConfig(BaseModel): 50 | checkpointed_model: PydanticPytorchModuleType 51 | 52 | else: 53 | raise NotImplementedError() 54 | 55 | components = component_factory.build_components(config_dict=config, components_model_type=PydanticConfig) 56 | return getattr(components, model_type.value) 57 | -------------------------------------------------------------------------------- /src/modalities/models/vision_transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/models/vision_transformer/__init__.py -------------------------------------------------------------------------------- /src/modalities/nn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/nn/__init__.py -------------------------------------------------------------------------------- /src/modalities/nn/mlp.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Optional 2 | 3 | from torch import Tensor, nn 4 | 5 | 6 | class MLP(nn.Module): 7 | def __init__( 8 | self, 9 | in_features: int, 10 | hidden_features: Optional[int] = None, 11 | out_features: Optional[int] = None, 12 | bias: bool = True, 13 | dropout: float = 0.0, 14 | act_fn: Callable[[], nn.Module] = nn.GELU, 15 | ): 16 | super().__init__() 17 | out_features = out_features or in_features 18 | hidden_features = hidden_features or 4 * in_features 19 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) 20 | self.act = act_fn() 21 | self.drop1 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() 22 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) 23 | self.drop2 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() 24 | 25 | def forward(self, x: Tensor) -> Tensor: 26 | x = self.fc1(x) 27 | x = self.act(x) 28 | x = self.drop1(x) 29 | x = self.fc2(x) 30 | x = self.drop2(x) 31 | return x 32 | -------------------------------------------------------------------------------- /src/modalities/nn/model_initialization/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/modalities/nn/model_initialization/initialization_if.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import torch.nn as nn 4 | 5 | 6 | class ModelInitializationIF(ABC): 7 | @abstractmethod 8 | def initialize_in_place(self, model: nn.Module): 9 | raise NotImplementedError 10 | -------------------------------------------------------------------------------- /src/modalities/optimizers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/optimizers/__init__.py 
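The MLP class in src/modalities/nn/mlp.py above implements the standard transformer feed-forward block: a linear expansion (hidden width defaulting to 4x the input width), an activation (GELU by default), dropout, and a linear projection back to the output width (defaulting to the input width). A minimal usage sketch, assuming only the class as defined above; the widths and the dropout value are illustrative, not prescribed by the module:

    import torch

    from modalities.nn.mlp import MLP

    # hidden_features defaults to 4 * in_features; out_features defaults to in_features.
    mlp = MLP(in_features=768, dropout=0.1)
    x = torch.randn(2, 16, 768)  # (batch, sequence, embedding)
    out = mlp(x)
    assert out.shape == x.shape  # the block preserves the input shape end-to-end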
-------------------------------------------------------------------------------- /src/modalities/optimizers/lr_schedulers.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from typing import Optional 3 | 4 | from torch.optim import Optimizer 5 | from torch.optim.lr_scheduler import LRScheduler 6 | 7 | 8 | class DummyLRScheduler(LRScheduler): 9 | def __init__(self, optimizer: Optimizer, last_epoch: Optional[int] = -1): 10 | super().__init__(optimizer, last_epoch) 11 | 12 | def get_lr(self) -> list[float]: 13 | if not self._get_lr_called_within_step: # type error expected due to internal pytorch implementation 14 | warnings.warn( 15 | "To get the last learning rate computed by the scheduler, " "please use `get_last_lr()`.", UserWarning 16 | ) 17 | 18 | return [group["lr"] for group in self.optimizer.param_groups] 19 | 20 | def _get_closed_form_lr(self) -> list[float]: 21 | return self.base_lrs 22 | -------------------------------------------------------------------------------- /src/modalities/preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/preprocessing/__init__.py -------------------------------------------------------------------------------- /src/modalities/registry/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/registry/__init__.py -------------------------------------------------------------------------------- /src/modalities/running_env/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/running_env/__init__.py -------------------------------------------------------------------------------- /src/modalities/running_env/cuda_env.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any 3 | 4 | import torch 5 | import torch.distributed as dist 6 | 7 | from modalities.config.config import ProcessGroupBackendType 8 | 9 | 10 | class CudaEnv: 11 | """Context manager to set the CUDA environment for distributed training.""" 12 | 13 | def __init__( 14 | self, 15 | process_group_backend: ProcessGroupBackendType, 16 | ) -> None: 17 | """Initializes the CudaEnv context manager with the process group backend. 18 | 19 | Args: 20 | process_group_backend (ProcessGroupBackendType): Process group backend to be used for distributed training. 21 | """ 22 | self.process_group_backend = process_group_backend 23 | 24 | def __enter__(self) -> "CudaEnv": 25 | """Sets the CUDA environment for distributed training. 26 | 27 | Returns: 28 | CudaEnv: Instance of the CudaEnv context manager. 29 | """ 30 | dist.init_process_group(self.process_group_backend.value) 31 | local_rank = int(os.getenv("LOCAL_RANK", "-1")) 32 | if local_rank == -1: 33 | raise ValueError("LOCAL_RANK environment variable is not set. Please set it before using CudaEnv.") 34 | torch.cuda.set_device(local_rank) 35 | return self 36 | 37 | def __exit__(self, type: Any, value: Any, traceback: Any): 38 | """Exits the CUDA environment for distributed training by destroying the process group. 
39 | 40 | Args: 41 | type (Any): The exception type, if an exception was raised; otherwise None. 42 | value (Any): The exception instance, if an exception was raised; otherwise None. 43 | traceback (Any): The traceback, if an exception was raised; otherwise None. 44 | """ 45 | # TODO and NOTE: 46 | # when we call barrier here and one of the ranks fails, we get stuck here. 47 | # In the future, we should probably add a timeout here and handle the case when one of the ranks fails. 48 | # dist.barrier() 49 | dist.destroy_process_group() 50 | -------------------------------------------------------------------------------- /src/modalities/running_env/env_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import torch.cuda.nccl as nccl 5 | import torch.distributed as dist 6 | from pydantic import BaseModel 7 | 8 | # TODO find a solution for github actions 9 | # to install this as a dependency 10 | # from pkg_resources import packaging 11 | from torch.distributed.fsdp import MixedPrecision 12 | 13 | from modalities.config.lookup_enum import LookupEnum 14 | 15 | 16 | def is_running_with_torchrun(): 17 | # Check for one of the environment variables set by torchrun 18 | return "LOCAL_RANK" in os.environ 19 | 20 | 21 | def has_bfloat_support(): 22 | return ( 23 | torch.version.cuda 24 | and torch.cuda.is_available() 25 | and torch.cuda.is_bf16_supported() 26 | # TODO find a solution for github actions 27 | # to install this as a dependency 28 | # and packaging.version.parse(torch.version.cuda).release >= (11, 0) 29 | and dist.is_nccl_available() 30 | and nccl.version() >= (2, 10) 31 | ) 32 | 33 | 34 | # requires grad scaler in main loop 35 | fpSixteen = MixedPrecision( 36 | param_dtype=torch.float16, 37 | # Gradient communication precision. 38 | reduce_dtype=torch.float16, 39 | # Buffer precision. 40 | buffer_dtype=torch.float16, 41 | ) 42 | 43 | bfSixteen = MixedPrecision( 44 | param_dtype=torch.bfloat16, 45 | # Gradient communication precision. 46 | reduce_dtype=torch.bfloat16, 47 | # Buffer precision.
48 | buffer_dtype=torch.bfloat16, 49 | ) 50 | 51 | bfSixteen_working = MixedPrecision( 52 | param_dtype=torch.float32, 53 | reduce_dtype=torch.bfloat16, 54 | buffer_dtype=torch.bfloat16, 55 | ) 56 | 57 | megatron_strategy = MixedPrecision( 58 | param_dtype=torch.bfloat16, 59 | reduce_dtype=torch.float32, 60 | # buffer_dtype=torch.bfloat16, 61 | ) 62 | 63 | fpThirtytwo = MixedPrecision( 64 | param_dtype=torch.float32, 65 | reduce_dtype=torch.float32, 66 | buffer_dtype=torch.float32, 67 | ) 68 | 69 | no_mixed_precision = None 70 | 71 | 72 | class MixedPrecisionSettings(LookupEnum): 73 | FP_16 = fpSixteen 74 | BF_16 = bfSixteen 75 | BF_16_WORKING = bfSixteen_working 76 | FP_32 = fpThirtytwo 77 | MIXED_PRECISION_MEGATRON = megatron_strategy 78 | NO_MIXED_PRECISION = no_mixed_precision 79 | 80 | 81 | class PyTorchDtypes(LookupEnum): 82 | FP_16 = torch.float16 83 | FP_32 = torch.float32 84 | BF_16 = torch.bfloat16 85 | 86 | 87 | class FSDP2MixedPrecisionSettings(BaseModel): 88 | param_dtype: PyTorchDtypes 89 | reduce_dtype: PyTorchDtypes 90 | -------------------------------------------------------------------------------- /src/modalities/running_env/fsdp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/running_env/fsdp/__init__.py -------------------------------------------------------------------------------- /src/modalities/running_env/fsdp/fsdp_auto_wrapper.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import logging 3 | from abc import ABC, abstractmethod 4 | from typing import Callable 5 | 6 | import torch.nn as nn 7 | from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy 8 | 9 | from modalities.config.lookup_enum import LookupEnum 10 | from modalities.util import get_module_class_from_name, print_rank_0 11 | 12 | 13 | class FSDPAutoWrapFactoryIF(ABC): 14 | @abstractmethod 15 | def get_auto_wrap_policy(self) -> Callable: 16 | raise NotImplementedError 17 | 18 | 19 | class FSDPTransformerAutoWrapPolicyFactory(FSDPAutoWrapFactoryIF): 20 | def __init__(self, model: nn.Module, block_names: list[str]) -> None: 21 | # TODO it's problematic that we store the model in-memory here. Might get too large in RAM... 22 | self.model = model 23 | self.block_names = block_names 24 | 25 | @staticmethod 26 | def _get_fsdp_blocks_from_block_names(model: nn.Module, block_names: list[str]) -> list[nn.Module]: 27 | fsdp_block_types = [] 28 | for cls_block_name in block_names: 29 | # TODO FullyShardedDataParallelPlugin from Accelerate uses string matching to find the correct 30 | # block class. In the long-term we should implement this ourselves in a more robust fashion.
31 | block_type = get_module_class_from_name(model, cls_block_name) 32 | 33 | if block_type is None: 34 | raise ValueError(f"Could not find block with name {cls_block_name} in model") 35 | fsdp_block_types.append(block_type) 36 | return fsdp_block_types 37 | 38 | def get_auto_wrap_policy(self) -> Callable: 39 | transformer_layer_cls = self._get_fsdp_blocks_from_block_names(model=self.model, block_names=self.block_names) 40 | logging.info(f"Wrapped layer classes: {transformer_layer_cls}\n") 41 | print_rank_0(f"\nWrapped layer classes: {transformer_layer_cls}\n") 42 | 43 | if len(transformer_layer_cls) == 0: 44 | raise ValueError("No FSDP blocks found in model") 45 | 46 | auto_wrapper_policy = functools.partial( 47 | transformer_auto_wrap_policy, 48 | transformer_layer_cls={ 49 | *transformer_layer_cls, 50 | }, 51 | ) 52 | return auto_wrapper_policy 53 | 54 | 55 | class FSDPAutoWrapFactoryTypes(LookupEnum): 56 | FSDPTransformerAutoWrapPolicyFactory = FSDPTransformerAutoWrapPolicyFactory 57 | -------------------------------------------------------------------------------- /src/modalities/running_env/fsdp/reducer.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | import torch 4 | import torch.distributed as dist 5 | 6 | 7 | class Reducer: 8 | @staticmethod 9 | def reduce( 10 | tensor: torch.Tensor, 11 | operation: dist.ReduceOp.RedOpType, 12 | post_processing_fun: Callable[[torch.Tensor], torch.Tensor] = None, 13 | ): 14 | dist.all_reduce(tensor, op=operation) 15 | if post_processing_fun is not None: 16 | tensor = post_processing_fun(tensor) 17 | return tensor 18 | -------------------------------------------------------------------------------- /src/modalities/tokenization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/tokenization/__init__.py -------------------------------------------------------------------------------- /src/modalities/training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/training/__init__.py -------------------------------------------------------------------------------- /src/modalities/training/activation_checkpointing.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import torch 4 | from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( 5 | CheckpointImpl, 6 | apply_activation_checkpointing, 7 | checkpoint_wrapper, 8 | ) 9 | from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP 10 | 11 | from modalities.util import get_module_class_from_name 12 | 13 | 14 | def is_module_to_apply_activation_checkpointing( 15 | submodule: torch.nn.Module, activation_checkpointing_modules: list[type] 16 | ) -> bool: 17 | return isinstance(submodule, tuple(activation_checkpointing_modules)) 18 | 19 | 20 | def apply_activation_checkpointing_inplace(model: torch.nn.Module, activation_checkpointing_modules: list[str]): 21 | activation_checkpointing_module_types = [ 22 | get_module_class_from_name(model, m) for m in activation_checkpointing_modules 23 | ] 24 | if not isinstance(model, FSDP): 25 | raise ValueError("activation checkpointing can only be applied to FSDP 
wrapped models!") 26 | non_reentrant_wrapper = partial(checkpoint_wrapper, checkpoint_impl=CheckpointImpl.NO_REENTRANT, debug=False) 27 | 28 | apply_activation_checkpointing( 29 | model, 30 | checkpoint_wrapper_fn=non_reentrant_wrapper, 31 | check_fn=lambda submodule: is_module_to_apply_activation_checkpointing( 32 | submodule, activation_checkpointing_module_types 33 | ), 34 | ) 35 | -------------------------------------------------------------------------------- /src/modalities/training/gradient_clipping/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/training/gradient_clipping/__init__.py -------------------------------------------------------------------------------- /src/modalities/training/gradient_clipping/fsdp_gradient_clipper_config.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated 2 | 3 | from pydantic import BaseModel, Field 4 | 5 | from modalities.config.pydantic_if_types import PydanticPytorchModuleType 6 | from modalities.training.gradient_clipping.fsdp_gradient_clipper import GradientClippingMode 7 | 8 | 9 | class FSDPGradientClipperConfig(BaseModel): 10 | """ 11 | Configuration class for FSDP gradient clipper. 12 | 13 | Args: 14 | max_norm (float): The maximum norm value for gradient clipping. 15 | norm_type (GradientClippingMode): The type of gradient clipping to be applied. 16 | wrapped_model (PydanticPytorchModuleType): The wrapped PyTorch model. 17 | 18 | Attributes: 19 | max_norm (float): The maximum norm value for gradient clipping. 20 | norm_type (GradientClippingMode): The type of gradient clipping to be applied. 21 | wrapped_model (PydanticPytorchModuleType): The wrapped PyTorch model. 22 | """ 23 | 24 | max_norm: Annotated[float, Field(strict=True, gt=0)] 25 | norm_type: GradientClippingMode 26 | wrapped_model: PydanticPytorchModuleType 27 | 28 | 29 | class FSDPDummyGradientClipperConfig(BaseModel): 30 | """ 31 | Configuration class for FSDP dummy gradient clipper. 32 | 33 | Args: 34 | wrapped_model (PydanticPytorchModuleType): The wrapped PyTorch model. 35 | norm_type (GradientClippingMode): The type of gradient clipping to be applied. 36 | 37 | Attributes: 38 | wrapped_model (PydanticPytorchModuleType): The wrapped PyTorch model. 39 | norm_type (GradientClippingMode): The type of gradient clipping to be applied. 40 | """ 41 | 42 | wrapped_model: PydanticPytorchModuleType 43 | norm_type: GradientClippingMode 44 | 45 | 46 | class DummyGradientClipperConfig(BaseModel): 47 | """ 48 | Configuration class for dummy gradient clipper. 49 | 50 | This class is a placeholder and does not have any specific functionality. 51 | 52 | Attributes: 53 | None 54 | 55 | Methods: 56 | None 57 | """ 58 | 59 | pass 60 | -------------------------------------------------------------------------------- /src/modalities/training/gradient_clipping/gradient_clipper.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import torch 4 | 5 | 6 | class GradientClipperIF(ABC): 7 | """The GradientClipper interface that defines the methods for clipping gradients.""" 8 | 9 | @abstractmethod 10 | def clip_gradients(self) -> torch.Tensor: 11 | """ 12 | Clip the gradients of the model. 13 | 14 | Returns: 15 | torch.Tensor: The clipped gradients. 
16 | """ 17 | raise NotImplementedError 18 | -------------------------------------------------------------------------------- /src/modalities/training/training_progress.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Optional 3 | 4 | 5 | @dataclass 6 | class TrainingProgress: 7 | """ 8 | Dataclass to store the training progress. 9 | 10 | Attributes: 11 | 12 | num_seen_steps_current_run (int): Number of seen steps in the current run. 13 | num_seen_tokens_current_run (int): Number of seen tokens in the current run. 14 | num_target_steps (int): Target number of steps. 15 | num_target_tokens (int): Target number of tokens. 16 | num_seen_steps_previous_run (Optional[int]): Number of seen steps in the previous run. 17 | num_seen_tokens_previous_run (Optional[int]): Number of seen tokens in the previous run. 18 | """ 19 | 20 | num_seen_steps_current_run: int 21 | num_seen_tokens_current_run: int 22 | num_target_steps: int 23 | num_target_tokens: int 24 | num_seen_steps_previous_run: Optional[int] = 0 25 | num_seen_tokens_previous_run: Optional[int] = 0 26 | 27 | @property 28 | def num_seen_steps_total(self) -> int: 29 | return self.num_seen_steps_current_run + self.num_seen_steps_previous_run 30 | 31 | @property 32 | def num_seen_tokens_total(self) -> int: 33 | return self.num_seen_tokens_current_run + self.num_seen_tokens_previous_run 34 | -------------------------------------------------------------------------------- /src/modalities/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/utils/__init__.py -------------------------------------------------------------------------------- /src/modalities/utils/logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def get_logger(name: str = "main") -> logging.Logger: 5 | logger = logging.getLogger(name) 6 | if not logger.handlers: 7 | logger.setLevel(logging.DEBUG) 8 | handler = logging.StreamHandler() 9 | handler.setFormatter(logging.Formatter("%(name)s - %(levelname)s - %(message)s")) 10 | logger.addHandler(handler) 11 | return logger 12 | -------------------------------------------------------------------------------- /src/modalities/utils/seeding.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | 3 | 4 | def calculate_hashed_seed(input_data: list[str], max_seed: int = 2**32 - 1) -> int: 5 | # Calculate a seed from a list of strings 6 | # The seed is a number between 0 and max_seed (exclusive) 7 | def _hash_string(input_data: str) -> str: 8 | hash_object = hashlib.sha256(input_data.encode("utf-8")) 9 | hash_hex = hash_object.hexdigest() 10 | return hash_hex 11 | 12 | # even though this becomes an extremely large integer value, 13 | # we don't get overflows as Python can represent integers of arbitrary size 14 | # https://docs.python.org/3/library/exceptions.html#OverflowError 15 | hash_strings = [_hash_string(x) for x in input_data] 16 | 17 | hash_sum = sum([int(x, 16) for x in hash_strings]) 18 | 19 | seed = hash_sum % max_seed # Ensure the seed fits within the max_seed range 20 | 21 | return seed 22 | -------------------------------------------------------------------------------- /src/modalities/utils/typing.py:
-------------------------------------------------------------------------------- 1 | from torch.distributed.fsdp import FSDPModule as FSDP2 2 | from torch.distributed.fsdp import FullyShardedDataParallel as FSDP1 3 | 4 | FSDPX = FSDP1 | FSDP2 5 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/__init__.py -------------------------------------------------------------------------------- /tests/checkpointing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/checkpointing/__init__.py -------------------------------------------------------------------------------- /tests/checkpointing/configs_for_testing/gpt2_config_test.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | component_key: model 3 | variant_key: gpt2 4 | config: 5 | sample_key: input_ids 6 | poe_type: NOPE 7 | sequence_length: 256 8 | prediction_key: logits 9 | vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency 10 | n_layer: 12 11 | n_head_q: 12 12 | n_head_kv: 12 13 | ffn_hidden: 2048 14 | n_embd: 768 15 | dropout: 0.0 16 | bias: true # True: bias in Linears, like GPT-2. False: a bit better and faster 17 | attention_config: 18 | qkv_transforms: 19 | - type_hint: RotaryTransform 20 | config: 21 | n_embd: ${model.config.n_embd} 22 | n_head: ${model.config.n_head_q} #it has to be head_q here 23 | seq_length_dim: -2 24 | base_freq: 10000 25 | attention_implementation: manual 26 | activation_type: gelu 27 | attention_norm_config: 28 | norm_type: rms_norm 29 | config: 30 | ndim: ${model.config.n_embd} 31 | bias: true 32 | epsilon: 1e-5 33 | ffn_norm_config: 34 | norm_type: rms_norm 35 | config: 36 | ndim: ${model.config.n_embd} 37 | bias: true 38 | epsilon: 1e-5 39 | lm_head_norm_config: 40 | norm_type: rms_norm 41 | config: 42 | ndim: ${model.config.n_embd} 43 | bias: true 44 | epsilon: 1e-5 45 | use_weight_tying: true 46 | 47 | checkpointed_model: 48 | component_key: model 49 | variant_key: fsdp1_checkpointed 50 | config: 51 | checkpoint_loading: 52 | component_key: checkpoint_loading 53 | variant_key: torch 54 | config: 55 | device: 0 56 | precision: BF16 57 | model: 58 | instance_key: model 59 | pass_type: BY_REFERENCE 60 | checkpoint_path: null -------------------------------------------------------------------------------- /tests/checkpointing/gpt2_config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | component_key: model 3 | variant_key: model_initialized 4 | config: 5 | model: 6 | instance_key: model_raw 7 | pass_type: BY_REFERENCE 8 | model_initializer: 9 | component_key: model_initialization 10 | variant_key: composed 11 | config: 12 | model_type: gpt2 13 | weight_init_type: scaled 14 | mean: 0.0 15 | std: 0.02 16 | num_layers: ${model_raw.config.n_layer} 17 | 18 | model_raw: 19 | component_key: model 20 | variant_key: gpt2 21 | config: 22 | sample_key: "input_ids" # TODO reference this 23 | poe_type: NOPE 24 | prediction_key: "logits" # TODO reference this 25 | sequence_length: 256 # TODO reference this (same as sequence length) 26 | vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to 
nearest multiple of 64 for efficiency 27 | n_layer: 2 28 | n_head_q: 4 29 | n_head_kv: 4 30 | ffn_hidden: 128 31 | n_embd: 128 32 | dropout: 0.0 33 | bias: true # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster 34 | attention_config: 35 | qkv_transforms: 36 | - type_hint: RotaryTransform 37 | config: 38 | n_embd: ${model_raw.config.n_embd} 39 | n_head: ${model_raw.config.n_head_q} #it has to be head_q here 40 | seq_length_dim: -2 41 | base_freq: 10000 42 | attention_implementation: manual 43 | activation_type: gelu 44 | attention_norm_config: 45 | norm_type: rms_norm 46 | config: 47 | ndim: ${model_raw.config.n_embd} 48 | bias: true 49 | epsilon: 1e-5 50 | ffn_norm_config: 51 | norm_type: rms_norm 52 | config: 53 | ndim: ${model_raw.config.n_embd} 54 | bias: true 55 | epsilon: 1e-5 56 | lm_head_norm_config: 57 | norm_type: rms_norm 58 | config: 59 | ndim: ${model_raw.config.n_embd} 60 | bias: true 61 | epsilon: 1e-5 62 | use_weight_tying: true 63 | -------------------------------------------------------------------------------- /tests/checkpointing/pytorch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/checkpointing/pytorch/__init__.py -------------------------------------------------------------------------------- /tests/checkpointing/pytorch/test_torch_checkpoint_loading.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn as nn 4 | 5 | from modalities.checkpointing.torch.torch_checkpoint_loading import TorchCheckpointLoading 6 | from modalities.config.config import PrecisionEnum 7 | 8 | 9 | class DummyModel(nn.Module): 10 | def __init__(self): 11 | super().__init__() 12 | self._weights = nn.Linear(2, 3) 13 | 14 | def forward(self, inputs: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: 15 | output = self._weights(**inputs) 16 | return {"output": output} 17 | 18 | 19 | @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="This test requires 1 GPU.") 20 | def test_load_model_checkpoint(tmp_path): 21 | # After storing the state_dict on disc, the model state does not 22 | # contain any information about the device or precision 23 | tmp_file_path = tmp_path / "model_state.pth" 24 | 25 | # model that we checkpoint 26 | model_1 = DummyModel().to(dtype=PrecisionEnum.BF16.value) 27 | 28 | # models that we load the checkpoint into 29 | model_2 = DummyModel().to(dtype=PrecisionEnum.FP16.value) 30 | model_3 = DummyModel().to(dtype=PrecisionEnum.FP16.value) 31 | 32 | # perform checkpointing 33 | model_state = model_1.state_dict() 34 | torch.save(model_state, tmp_file_path) 35 | 36 | # load the model checkpoint with different settings 37 | gpu_device = torch.device("cuda:0") 38 | loaded_model_1: DummyModel = TorchCheckpointLoading( 39 | device=gpu_device, precision=PrecisionEnum.FP32 40 | ).load_model_checkpoint(model_2, tmp_file_path) 41 | 42 | assert torch.equal(model_1._weights.weight.to(gpu_device), loaded_model_1._weights.weight) 43 | assert torch.equal(model_1._weights.bias.to(gpu_device), loaded_model_1._weights.bias) 44 | 45 | # since we provided the precision, the model will be loaded with the specified precision 46 | # even if the state dict contains a different precision. 
47 | assert loaded_model_1._weights.weight.dtype == torch.float32 48 | assert loaded_model_1._weights.weight.device == gpu_device 49 | 50 | # if we don't specify the precision, the model will be loaded with the precision of the state dict. 51 | # In this case, BF16 is used as defined for model_1. 52 | loaded_model_2: DummyModel = TorchCheckpointLoading(device=gpu_device).load_model_checkpoint(model_3, tmp_file_path) 53 | assert loaded_model_2._weights.weight.dtype == torch.bfloat16 54 | -------------------------------------------------------------------------------- /tests/checkpointing/test_checkpoint_execution_functions.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | import torch.nn as nn 5 | from torch.distributed.fsdp import FullyShardedDataParallel as FSDP 6 | 7 | from modalities.checkpointing.fsdp.fsdp_checkpoint_saving import FSDP1CheckpointSaving 8 | from modalities.training.training_progress import TrainingProgress 9 | 10 | 11 | @pytest.mark.skip 12 | def dummy_method(module: nn.Module, flag: bool) -> FSDP: 13 | raise NotImplementedError 14 | 15 | 16 | @pytest.mark.skip 17 | def is_empty_directory(folder_path: str) -> bool: 18 | path = Path(folder_path) 19 | return not any(path.iterdir()) 20 | 21 | 22 | CONTENT = "model" 23 | 24 | 25 | def test_get_paths_to_delete(tmp_path): # pytest temp path 26 | checkpointing = FSDP1CheckpointSaving( 27 | checkpoint_path=tmp_path, 28 | experiment_id=str(1), 29 | global_rank=0, 30 | ) 31 | training_progress = TrainingProgress( 32 | num_seen_tokens_current_run=5, num_seen_steps_current_run=10, num_target_tokens=40, num_target_steps=20 33 | ) 34 | 35 | files_paths_to_delete = checkpointing._get_paths_to_delete(training_progress=training_progress) 36 | assert len(files_paths_to_delete) == 2 37 | 38 | 39 | def test_delete_checkpoint(tmpdir): 40 | experiment_id = "2022-05-07__14-31-22" 41 | training_progress = TrainingProgress( 42 | num_seen_tokens_current_run=5, num_seen_steps_current_run=10, num_target_tokens=40, num_target_steps=20 43 | ) 44 | directory = Path(tmpdir) 45 | 46 | (directory / experiment_id).mkdir(exist_ok=True) 47 | optimizer_file_name = ( 48 | f"eid_{experiment_id}-optimizer-seen_steps_{training_progress.num_seen_steps_total}" 49 | f"-seen_tokens_{training_progress.num_seen_tokens_total}" 50 | f"-target_steps_{training_progress.num_target_steps}" 51 | f"-target_tokens_{training_progress.num_target_tokens}.bin" 52 | ) 53 | optimizer_path = directory / experiment_id / optimizer_file_name 54 | optimizer_path.write_text(CONTENT) 55 | 56 | model_file_name = ( 57 | f"eid_{experiment_id}-model-seen_steps_{training_progress.num_seen_steps_total}" 58 | f"-seen_tokens_{training_progress.num_seen_tokens_total}" 59 | f"-target_steps_{training_progress.num_target_steps}" 60 | f"-target_tokens_{training_progress.num_target_tokens}.bin" 61 | ) 62 | model_path = directory / experiment_id / model_file_name 63 | model_path.write_text(CONTENT) 64 | 65 | checkpoint_saving = FSDP1CheckpointSaving( 66 | checkpoint_path=directory, 67 | experiment_id=experiment_id, 68 | global_rank=0, 69 | ) 70 | checkpoint_saving._delete_checkpoint(training_progress=training_progress) 71 | assert is_empty_directory((directory / experiment_id).__str__()) 72 | -------------------------------------------------------------------------------- /tests/checkpointing/test_checkpoint_strategies.py: -------------------------------------------------------------------------------- 1 | import
pytest 2 | 3 | from modalities.checkpointing.checkpoint_saving_strategies import SaveKMostRecentCheckpointsStrategy 4 | from modalities.training.training_progress import TrainingProgress 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "k, saved_instances, checkpoints_to_delete, save_current", 9 | [ 10 | # k value is 2. A new checkpoint is created and the oldest one (here: TrainingProgress(1, 1, 20, 20)) is deleted. 11 | (2, [TrainingProgress(2, 2, 20, 20), TrainingProgress(1, 1, 20, 20)], [TrainingProgress(1, 1, 20, 20)], True), 12 | # k value is 0. No deletion of checkpoints. 13 | (0, [], [], False), 14 | # k value is 2, but there is currently only one checkpoint. Hence, no deletion. 15 | (2, [TrainingProgress(1, 1, 20, 20)], [], True), 16 | # k value is -1, therefore we want to keep all checkpoints without any deletion 17 | ( 18 | -1, 19 | [TrainingProgress(3, 3, 20, 20), TrainingProgress(2, 2, 20, 20), TrainingProgress(1, 1, 20, 20)], 20 | [], 21 | True, 22 | ), 23 | ], 24 | ) 25 | def test_checkpoint_strategy_k( 26 | k: int, saved_instances: list[TrainingProgress], checkpoints_to_delete: list[TrainingProgress], save_current: bool 27 | ) -> None: 28 | num_seen_steps_current_run = 10 29 | training_progress = TrainingProgress( 30 | num_seen_steps_current_run=num_seen_steps_current_run, 31 | num_seen_tokens_current_run=10, 32 | num_target_steps=20, 33 | num_target_tokens=40, 34 | ) 35 | checkpoint_strategy = SaveKMostRecentCheckpointsStrategy(k=k) 36 | checkpoint_strategy.saved_step_checkpoints = saved_instances 37 | checkpoint_instruction = checkpoint_strategy.get_checkpoint_instruction(training_progress=training_progress) 38 | 39 | assert checkpoint_instruction.checkpoints_to_delete == checkpoints_to_delete 40 | assert checkpoint_instruction.save_current == save_current 41 | 42 | # make sure that modifying the training progress externally does not affect saved_step_checkpoints 43 | if k != 0 and save_current: 44 | training_progress.num_seen_steps_current_run = 100 45 | assert checkpoint_strategy.saved_step_checkpoints[0].num_seen_steps_current_run == num_seen_steps_current_run 46 | -------------------------------------------------------------------------------- /tests/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/config/__init__.py -------------------------------------------------------------------------------- /tests/config/components.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class Component_V_W_X_IF: 5 | def print(self) -> None: 6 | print("ComponentIF") 7 | 8 | 9 | # Dependencies 10 | 11 | 12 | class ComponentV(Component_V_W_X_IF): 13 | def __init__(self, val_v: str) -> None: 14 | self.val_v = val_v 15 | 16 | 17 | class ComponentW(Component_V_W_X_IF): 18 | def __init__(self, val_w: str) -> None: 19 | self.val_w = val_w 20 | 21 | 22 | # Components 23 | 24 | 25 | class ComponentX(Component_V_W_X_IF): 26 | def __init__(self, val_x: str, single_dependency: Component_V_W_X_IF) -> None: 27 | self.val_x = val_x 28 | self.single_dependency = single_dependency 29 | 30 | 31 | class ComponentY: 32 | def __init__(self, val_y: str, multi_dependency: list[Component_V_W_X_IF]) -> None: 33 | self.val_y = val_y 34 | self.multi_dependency = multi_dependency 35 | 36 | 37 | class ComponentZ: 38 | def __init__(self, val_z: str) -> None: 39 | self.val_z = val_z 40 | 41 | 42 | class ComponentTypes(Enum): 43 |
COMP_V = ComponentV 44 | COMP_W = ComponentW 45 | COMP_X = ComponentX 46 | COMP_Y = ComponentY 47 | COMP_Z = ComponentZ 48 | -------------------------------------------------------------------------------- /tests/config/configs.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated 2 | 3 | from pydantic import BaseModel 4 | 5 | from modalities.config.pydantic_if_types import PydanticThirdPartyTypeIF 6 | from tests.config.components import Component_V_W_X_IF 7 | 8 | PydanticComponent_V_W_X_IF_Type = Annotated[Component_V_W_X_IF, PydanticThirdPartyTypeIF(Component_V_W_X_IF)] 9 | 10 | 11 | class CompVConfig(BaseModel): 12 | val_v: str 13 | 14 | 15 | class CompWConfig(BaseModel): 16 | val_w: str 17 | 18 | 19 | class CompXConfig(BaseModel): 20 | val_x: str 21 | single_dependency: PydanticComponent_V_W_X_IF_Type 22 | 23 | 24 | class CompYConfig(BaseModel): 25 | val_y: str 26 | multi_dependency: list[PydanticComponent_V_W_X_IF_Type] 27 | 28 | 29 | class CompZConfig(BaseModel): 30 | val_z: str 31 | -------------------------------------------------------------------------------- /tests/config/custom_components.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from enum import Enum 3 | from typing import Literal 4 | 5 | from pydantic import BaseModel, validator 6 | 7 | 8 | class CustomComponent1: 9 | def __init__(self, val_1: str) -> None: 10 | self.val_1 = val_1 11 | 12 | 13 | class CustomComponentTypes(Enum): 14 | CUSTOM_COMP_1 = CustomComponent1 15 | 16 | 17 | class CustomCompConfigABC(BaseModel, ABC): 18 | # TODO make this a string and then implement the mapping 19 | # to the class outside of the basemodel (i.e. in the factory) 20 | type_hint: Enum 21 | 22 | @validator("type_hint", pre=True, allow_reuse=True, check_fields=False) 23 | def _string_to_enum(cls, key: str): 24 | if isinstance(key, str): 25 | try: 26 | key = CustomComponentTypes[key] 27 | except KeyError as e: 28 | raise ValueError(f"{key} is not a valid ComponentType") from e 29 | return key 30 | return key 31 | 32 | 33 | class CustomComp1Config(CustomCompConfigABC): 34 | type_hint: Literal[CustomComponentTypes.CUSTOM_COMP_1] 35 | val_1: str 36 | -------------------------------------------------------------------------------- /tests/config/test_configs/config_backward_reference.yaml: -------------------------------------------------------------------------------- 1 | comp_x_1: 2 | component_key: COMP_X 3 | variant_key: default 4 | config: 5 | val_x: "some other value X" 6 | single_dependency: 7 | component_key: COMP_W 8 | variant_key: default 9 | config: 10 | val_w: "some other value w" 11 | 12 | comp_y_1: 13 | component_key: COMP_Y 14 | variant_key: default 15 | config: 16 | val_y: "some other value y" 17 | multi_dependency: 18 | - component_key: COMP_W 19 | variant_key: default 20 | config: 21 | val_w: "some other value w" 22 | - component_key: COMP_V 23 | variant_key: default 24 | config: 25 | val_v: "some other value v" 26 | - instance_key: comp_x_1 27 | pass_type: BY_REFERENCE 28 | 29 | -------------------------------------------------------------------------------- /tests/config/test_configs/config_forward_reference.yaml: -------------------------------------------------------------------------------- 1 | comp_y_1: 2 | component_key: COMP_Y 3 | variant_key: default 4 | config: 5 | val_y: "some other value y" 6 | multi_dependency: 7 | - component_key: COMP_W 8 | variant_key: default 9 | config: 10 | 
val_w: "some other value w" 11 | - component_key: COMP_V 12 | variant_key: default 13 | config: 14 | val_v: "some other value v" 15 | - instance_key: comp_x_1 16 | pass_type: BY_REFERENCE 17 | 18 | comp_x_1: 19 | component_key: COMP_X 20 | variant_key: default 21 | config: 22 | val_x: "some other value X" 23 | single_dependency: 24 | component_key: COMP_W 25 | variant_key: default 26 | config: 27 | val_w: "some other value w" -------------------------------------------------------------------------------- /tests/config/test_configs/config_hierarchical_list_component.yaml: -------------------------------------------------------------------------------- 1 | 2 | comp_y_1: 3 | component_key: COMP_Y 4 | variant_key: default 5 | config: 6 | val_y: "some other value y" 7 | multi_dependency: 8 | - component_key: COMP_W 9 | variant_key: default 10 | config: 11 | val_w: "some other value w" 12 | - component_key: COMP_V 13 | variant_key: default 14 | config: 15 | val_v: "some other value v" -------------------------------------------------------------------------------- /tests/config/test_configs/config_multiple_top_level_components_with_references.yaml: -------------------------------------------------------------------------------- 1 | # we want to test that comp_x_1->val_x is not referencing 2 | # top-level component val_x 3 | val_x: 4 | component_key: COMP_X 5 | variant_key: default 6 | config: 7 | val_x: "val_x -> config -> val_x" 8 | single_dependency: 9 | component_key: COMP_W 10 | variant_key: default 11 | config: 12 | val_w: "val_w_123" 13 | 14 | single_dependency: 15 | component_key: COMP_W 16 | variant_key: default 17 | config: 18 | val_w: "single_dependency -> config -> val_w" 19 | 20 | comp_x_1: 21 | component_key: COMP_X 22 | variant_key: default 23 | config: 24 | val_x: "comp_x_1 -> config -> val_x" 25 | single_dependency: 26 | component_key: COMP_W 27 | variant_key: default 28 | config: 29 | val_w: "val_w_123" 30 | 31 | 32 | # we want to check that comp_x_2 and comp_x_3 are pointing 33 | # to the same instance of comp_w_1 and 34 | # comp_x_1 is not pointing to comp_w_1 35 | comp_x_2: 36 | component_key: COMP_X 37 | variant_key: default 38 | config: 39 | val_x: "comp_x_2 -> config -> val_x" 40 | single_dependency: 41 | instance_key: comp_w_1 42 | pass_type: BY_REFERENCE 43 | 44 | comp_w_1: 45 | component_key: COMP_W 46 | variant_key: default 47 | config: 48 | val_w: "comp_w_1 -> comp_w" 49 | 50 | comp_x_3: 51 | component_key: COMP_X 52 | variant_key: default 53 | config: 54 | val_x: "comp_x_3 -> config -> val_x" 55 | single_dependency: 56 | instance_key: comp_w_1 57 | pass_type: BY_REFERENCE 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /tests/config/test_configs/config_non_existing_reference.yaml: -------------------------------------------------------------------------------- 1 | comp_y_1: 2 | component_key: COMP_Y 3 | variant_key: default 4 | config: 5 | val_y: "some other value y" 6 | multi_dependency: 7 | - component_key: COMP_W 8 | variant_key: default 9 | config: 10 | val_w: "some other value w" 11 | - component_key: COMP_V 12 | variant_key: default 13 | config: 14 | val_v: "some other value v" 15 | - instance_key: comp_x_1 16 | pass_type: BY_REFERENCE 17 | 18 | -------------------------------------------------------------------------------- /tests/config/test_configs/config_single_component.yaml: -------------------------------------------------------------------------------- 1 | custom_comp_1: 2 | component_key: COMP_V 3 | 
variant_key: default 4 | config: 5 | val_v: "some value v" -------------------------------------------------------------------------------- /tests/conversion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/conversion/__init__.py -------------------------------------------------------------------------------- /tests/conversion/gpt2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/conversion/gpt2/__init__.py -------------------------------------------------------------------------------- /tests/conversion/gpt2/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from pathlib import Path 4 | 5 | import pytest 6 | import torch 7 | 8 | from modalities.config.config import load_app_config_dict 9 | from modalities.models.gpt2.gpt2_model import GPT2LLM 10 | from modalities.models.utils import ModelTypeEnum, get_model_from_config 11 | from tests.conftest import _ROOT_DIR 12 | 13 | 14 | @pytest.fixture 15 | def gpt2_config_path(tmpdir_factory: pytest.TempdirFactory, initialized_model: GPT2LLM, config_file_path: str) -> str: 16 | tmp_path = tmpdir_factory.mktemp("gpt2_model") 17 | new_config_filename = tmp_path / "gpt2_config_test.yaml" 18 | model_path = tmp_path / "model.pth" 19 | shutil.copy(config_file_path, new_config_filename) 20 | torch.save(initialized_model.state_dict(), model_path) 21 | with open(new_config_filename, "r") as file: 22 | content = file.read() 23 | content = content.replace("checkpoint_path: null", f"checkpoint_path: {model_path}") 24 | with open(new_config_filename, "w") as file: 25 | file.write(content) 26 | return str(new_config_filename) 27 | 28 | 29 | @pytest.fixture() 30 | def initialized_model(set_env, modalities_config_dict: dict) -> GPT2LLM: 31 | model = get_model_from_config(config=modalities_config_dict, model_type=ModelTypeEnum.MODEL) 32 | assert isinstance(model, GPT2LLM) 33 | return model 34 | 35 | 36 | @pytest.fixture() 37 | def set_env(): 38 | os.environ["LOCAL_RANK"] = "0" 39 | os.environ["RANK"] = "0" 40 | os.environ["WORLD_SIZE"] = "1" 41 | 42 | 43 | @pytest.fixture() 44 | def modalities_config_dict(config_file_path: Path) -> dict: 45 | return load_app_config_dict(config_file_path=config_file_path) 46 | 47 | 48 | @pytest.fixture() 49 | def config_file_path(config_file_name: str) -> Path: 50 | config_file_path = _ROOT_DIR / Path("tests/conversion/test_configs/" + config_file_name) 51 | return config_file_path 52 | 53 | 54 | @pytest.fixture(params=["gpt2_config_test.yaml"]) 55 | def config_file_name(request) -> str: 56 | return request.param 57 | -------------------------------------------------------------------------------- /tests/conversion/gpt2/helper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from modalities.conversion.gpt2.modeling_gpt2 import GPT2DecoderLayer, GPT2ForCausalLM 5 | from modalities.models.gpt2.gpt2_model import GPT2LLM, GPT2Block 6 | 7 | 8 | def check_same_weight_model(converted_model: GPT2ForCausalLM, modalities_model: GPT2LLM): 9 | converted_model.to(device=modalities_model.transformer.h[0].attn.q_attn.weight.device) 10 | assert 
torch.equal(converted_model.model.embed_tokens.weight, modalities_model.transformer.wte.weight) 11 | for i, (llama_layer, modalities_layer) in enumerate( 12 | zip(converted_model.model.layers, modalities_model.transformer.h) 13 | ): 14 | check_same_weight_attention(llama_layer, modalities_layer) 15 | check_same_weight_mlp(llama_layer, modalities_layer) 16 | check_same_weight_layer_norms(llama_layer, modalities_layer) 17 | check_same_weight_base_modules(converted_model.lm_head, modalities_model.transformer.lm_head) 18 | check_same_weight_base_modules(converted_model.model.norm, modalities_model.transformer.lm_head_norm) 19 | 20 | 21 | def check_same_weight_attention(llama_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): 22 | check_same_weight_base_modules(llama_layer.self_attn.q_proj, modalities_layer.attn.q_attn) 23 | check_same_weight_base_modules(llama_layer.self_attn.k_proj, modalities_layer.attn.k_attn) 24 | check_same_weight_base_modules(llama_layer.self_attn.v_proj, modalities_layer.attn.v_attn) 25 | check_same_weight_base_modules(llama_layer.self_attn.o_proj, modalities_layer.attn.c_proj) 26 | 27 | 28 | def check_same_weight_mlp(llama_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): 29 | check_same_weight_base_modules(llama_layer.mlp.down_proj, modalities_layer.mlp.W_2) 30 | check_same_weight_base_modules(llama_layer.mlp.gate_proj, modalities_layer.mlp.W) 31 | check_same_weight_base_modules(llama_layer.mlp.up_proj, modalities_layer.mlp.V) 32 | 33 | 34 | def check_same_weight_layer_norms(llama_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): 35 | check_same_weight_base_modules(llama_layer.input_layernorm, modalities_layer.attention_norm) 36 | check_same_weight_base_modules(llama_layer.post_attention_layernorm, modalities_layer.ffn_norm) 37 | 38 | 39 | def check_same_weight_base_modules(l1: nn.Linear | nn.LayerNorm, l2: nn.Linear | nn.LayerNorm): 40 | assert torch.equal(l1.weight, l2.weight) 41 | assert (l1.bias is None and l2.bias is None) or torch.equal(l1.bias, l2.bias) 42 | -------------------------------------------------------------------------------- /tests/conversion/gpt2/test_conversion_code.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from modalities.conversion.gpt2.conversion_code import transfer_model_code 4 | 5 | 6 | def test_modeling_gpt2_gets_transferred_with_model_files(tmp_path: Path): 7 | modeling_gpt2_path = tmp_path / "modeling_gpt2.py" 8 | assert not modeling_gpt2_path.exists() 9 | transfer_model_code(str(tmp_path)) 10 | assert modeling_gpt2_path.exists() 11 | 12 | 13 | def test_configuration_gpt2_gets_transferred_with_model_files(tmp_path: Path): 14 | configuration_gpt2_path = tmp_path / "configuration_gpt2.py" 15 | assert not configuration_gpt2_path.exists() 16 | transfer_model_code(str(tmp_path)) 17 | assert configuration_gpt2_path.exists() 18 | 19 | 20 | def test_transferred_modeling_gpt2_does_not_import_from_modalities(tmp_path: Path): 21 | transfer_model_code(str(tmp_path)) 22 | with open(tmp_path / "modeling_gpt2.py") as f: 23 | text = f.read() 24 | assert "from modalities" not in text 25 | assert "import modalities" not in text 26 | 27 | 28 | def test_transferred_configuration_gpt2_does_not_import_from_modalities(tmp_path: Path): 29 | transfer_model_code(str(tmp_path)) 30 | with open(tmp_path / "configuration_gpt2.py") as f: 31 | text = f.read() 32 | assert "from modalities" not in text 33 | assert "import modalities" not in text 34 | 
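[Editorial note, not a file from the repository: the conversion helpers exercised by the tests above and below compose into a single export flow, which the convert_gpt2 entry point (see test_convert_gpt2.py further down) wraps. The sketch below is a minimal, hypothetical usage example: the config path and output directory are placeholder values, and apart from save_pretrained (the standard Hugging Face export method) it only uses functions with the signatures shown in the surrounding test files.]

from pathlib import Path

from modalities.config.config import load_app_config_dict
from modalities.conversion.gpt2.conversion_code import transfer_model_code
from modalities.conversion.gpt2.conversion_model import check_converted_model, convert_model_checkpoint

config_path = "configs/gpt2_config.yaml"  # hypothetical config path
output_dir = Path("converted_gpt2")  # hypothetical output directory

modalities_config = load_app_config_dict(config_file_path=config_path)
# convert the checkpoint; returns the Hugging Face model and the original modalities model
hf_model, modalities_model = convert_model_checkpoint(modalities_config)
# sanity check: both models should produce the same logits on random token ids
vocab_size = modalities_config["model"]["config"]["vocab_size"]
check_converted_model(hf_model=hf_model, modalities_model=modalities_model, num_testruns=1, vocab_size=vocab_size)
# export the weights, then place the standalone modeling/configuration code next to them
hf_model.save_pretrained(output_dir)
transfer_model_code(str(output_dir))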
-------------------------------------------------------------------------------- /tests/conversion/gpt2/test_conversion_model.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn as nn 4 | 5 | from modalities.config.config import load_app_config_dict 6 | from modalities.conversion.gpt2.conversion_model import ( 7 | _copy_weights_base_modules, 8 | check_converted_model, 9 | convert_model_checkpoint, 10 | ) 11 | from tests.conversion.gpt2.helper import check_same_weight_base_modules, check_same_weight_model 12 | 13 | 14 | def test_convert_model_can_generate(gpt2_config_path: str): 15 | modalities_config = load_app_config_dict(gpt2_config_path) 16 | hf_model, _ = convert_model_checkpoint(modalities_config) 17 | assert hf_model.can_generate() 18 | 19 | 20 | def test_convert_model_checkpoint_does_not_change_weights(gpt2_config_path: str): 21 | modalities_config = load_app_config_dict(gpt2_config_path) 22 | hf_model, modalities_model = convert_model_checkpoint(modalities_config) 23 | check_same_weight_model(hf_model, modalities_model) 24 | 25 | 26 | def test_convert_model_checkpoint_produces_same_logits_as_original(gpt2_config_path: str): 27 | modalities_config = load_app_config_dict(gpt2_config_path) 28 | hf_model, modalities_model = convert_model_checkpoint(modalities_config) 29 | vocab_size = modalities_config["model_raw" if "model_raw" in modalities_config else "model"]["config"]["vocab_size"] 30 | check_converted_model(hf_model, modalities_model, num_testruns=1, vocab_size=vocab_size) 31 | 32 | 33 | def test_copying_base_modules_weights_yields_identical_modules(): 34 | m1 = nn.Linear(10, 10, bias=True) 35 | m2 = nn.Linear(10, 10, bias=True) 36 | m2.weight.data = torch.randn(10, 10) 37 | m2.bias.data = torch.randn(10) 38 | 39 | _copy_weights_base_modules(m1, m2) 40 | 41 | check_same_weight_base_modules(m1, m2) 42 | 43 | 44 | def test_copying_base_modules_works_when_bias_is_false(): 45 | m1 = nn.Linear(10, 10, bias=False) 46 | m2 = nn.Linear(10, 10, bias=False) 47 | m2.weight.data = torch.randn(10, 10) 48 | 49 | _copy_weights_base_modules(m1, m2) 50 | 51 | check_same_weight_base_modules(m1, m2) 52 | 53 | 54 | def test_copying_base_modules_fails_if_bias_settings_mismatch(): 55 | m1 = nn.Linear(10, 10, bias=False) 56 | m2 = nn.Linear(10, 10, bias=True) 57 | m2.weight.data = torch.randn(10, 10) 58 | m2.bias.data = torch.randn(10) 59 | 60 | with pytest.raises(AttributeError): 61 | _copy_weights_base_modules(m1, m2) 62 | -------------------------------------------------------------------------------- /tests/conversion/gpt2/test_conversion_tokenizer.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | from transformers import LlamaTokenizer 5 | 6 | from modalities.conversion.gpt2.conversion_tokenizer import convert_tokenizer 7 | from modalities.tokenization.tokenizer_wrapper import PreTrainedSPTokenizer 8 | 9 | 10 | def test_converted_tokenizer_produces_same_tokens_as_original( 11 | converted_tokenizer: LlamaTokenizer, sp_tokenizer: PreTrainedSPTokenizer, text: str 12 | ): 13 | converted_token_ids = converted_tokenizer(text) 14 | sp_token_ids = sp_tokenizer.tokenize(text) 15 | assert converted_token_ids["input_ids"] == sp_token_ids, "Converted token IDs do not match original token IDs." 
16 | 17 | 18 | def test_converted_tokenizer_detokenizes_same_as_original( 19 | converted_tokenizer: LlamaTokenizer, sp_tokenizer: PreTrainedSPTokenizer, token_ids: list[int] 20 | ): 21 | converted_tokens = converted_tokenizer.decode(token_ids) 22 | sp_tokens = sp_tokenizer.decode(token_ids) 23 | assert converted_tokens == sp_tokens, "Decoded tokens do not match between converted and original tokenizers." 24 | 25 | 26 | @pytest.fixture 27 | def converted_tokenizer(tmp_path: Path, tokenizer_model_file: str) -> LlamaTokenizer: 28 | convert_tokenizer(tokenizer_model_path=tokenizer_model_file, output_dir=tmp_path) 29 | return LlamaTokenizer.from_pretrained(tmp_path) 30 | 31 | 32 | @pytest.fixture 33 | def sp_tokenizer(tokenizer_model_file: str) -> PreTrainedSPTokenizer: 34 | return PreTrainedSPTokenizer(tokenizer_model_file=tokenizer_model_file) 35 | 36 | 37 | @pytest.fixture 38 | def tokenizer_model_file() -> str: 39 | return "data/tokenizer/sentencepiece_dclm/en_32k_tokenizer.model" 40 | 41 | 42 | @pytest.fixture( 43 | params=[ 44 | "", 45 | "Hello,\n my dog is cute", 46 | "the secret phrase is ossifrage", 47 | ] 48 | ) 49 | def text(request: pytest.FixtureRequest) -> str: 50 | return request.param 51 | 52 | 53 | @pytest.fixture( 54 | params=[ 55 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 56 | [0, 20527, 1, 20527, 2, 20527, 3, 20527, 4, 20527, 5, 20527, 6, 20527], 57 | [1, 20527], 58 | ] 59 | ) 60 | def token_ids(request: pytest.FixtureRequest) -> list[int]: 61 | return request.param 62 | -------------------------------------------------------------------------------- /tests/conversion/gpt2/test_convert_gpt2.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | import torch 5 | from transformers import AutoModelForCausalLM, PreTrainedModel 6 | 7 | from modalities.config.config import load_app_config_dict 8 | from modalities.conversion.gpt2.conversion_model import check_converted_model 9 | from modalities.conversion.gpt2.convert_gpt2 import convert_gpt2 10 | from modalities.models.gpt2.gpt2_model import GPT2LLM 11 | from modalities.models.utils import ModelTypeEnum, get_model_from_config 12 | from tests.conversion.gpt2.helper import check_same_weight_model 13 | 14 | 15 | def test_converting_gpt2_does_not_change_weights(converted_model: PreTrainedModel, original_model: GPT2LLM): 16 | check_same_weight_model(converted_model, original_model) 17 | 18 | 19 | def test_converting_gpt2_does_not_change_outputs( 20 | converted_model: PreTrainedModel, original_model: GPT2LLM, vocab_size: int 21 | ): 22 | check_converted_model( 23 | hf_model=converted_model, modalities_model=original_model, num_testruns=1, vocab_size=vocab_size 24 | ) 25 | 26 | 27 | @pytest.fixture 28 | def converted_model(run_convert_gpt2: None, output_dir: Path) -> PreTrainedModel: 29 | return AutoModelForCausalLM.from_pretrained(output_dir, local_files_only=True, trust_remote_code=True).to( 30 | dtype=torch.bfloat16 31 | ) 32 | 33 | 34 | @pytest.fixture 35 | def run_convert_gpt2(gpt2_config_path: str, output_dir: Path): 36 | convert_gpt2(gpt2_config_path, output_dir) 37 | 38 | 39 | @pytest.fixture 40 | def original_model(gpt2_config_path: str) -> GPT2LLM: 41 | modalities_config = load_app_config_dict(gpt2_config_path) 42 | return get_model_from_config(modalities_config, model_type=ModelTypeEnum.CHECKPOINTED_MODEL) 43 | 44 | 45 | @pytest.fixture 46 | def vocab_size(gpt2_config_path: str) -> int: 47 | modalities_config = load_app_config_dict(gpt2_config_path) 48 
| return modalities_config["model_raw" if "model_raw" in modalities_config else "model"]["config"]["vocab_size"] 49 | 50 | 51 | @pytest.fixture 52 | def output_dir(tmp_path: Path) -> Path: 53 | return tmp_path / "output" 54 | -------------------------------------------------------------------------------- /tests/conversion/test_configs/gpt2_config_test.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | component_key: model 3 | variant_key: gpt2 4 | config: 5 | sample_key: input_ids 6 | poe_type: NOPE 7 | sequence_length: 128 8 | prediction_key: logits 9 | vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency 10 | n_layer: 3 11 | n_head_q: 4 12 | n_head_kv: 4 13 | ffn_hidden: 512 14 | n_embd: 256 15 | dropout: 0.0 16 | bias: false # True: bias in Linears, like GPT-2. False: a bit better and faster 17 | attention_config: 18 | qkv_transforms: 19 | - type_hint: RotaryTransform 20 | config: 21 | n_embd: ${model.config.n_embd} 22 | n_head: ${model.config.n_head_q} #it has to be head_q here 23 | seq_length_dim: -2 24 | base_freq: 500000 25 | attention_implementation: pytorch_flash # manual 26 | activation_type: swiglu 27 | attention_norm_config: 28 | norm_type: layer_norm 29 | config: 30 | normalized_shape: ${model.config.n_embd} 31 | eps: 1e-5 32 | bias: true 33 | ffn_norm_config: 34 | norm_type: layer_norm 35 | config: 36 | normalized_shape: ${model.config.n_embd} 37 | eps: 1e-5 38 | bias: true 39 | lm_head_norm_config: 40 | norm_type: layer_norm 41 | config: 42 | normalized_shape: ${model.config.n_embd} 43 | eps: 1e-5 44 | bias: true 45 | use_weight_tying: true 46 | 47 | checkpointed_model: 48 | component_key: model 49 | variant_key: fsdp1_checkpointed 50 | config: 51 | checkpoint_loading: 52 | component_key: checkpoint_loading 53 | variant_key: torch 54 | config: 55 | device: cpu 56 | precision: BF16 57 | model: 58 | instance_key: model 59 | pass_type: BY_REFERENCE 60 | checkpoint_path: null -------------------------------------------------------------------------------- /tests/data/datasets/lorem_ipsum_long.idx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/data/datasets/lorem_ipsum_long.idx -------------------------------------------------------------------------------- /tests/data/datasets/lorem_ipsum_long.pbin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/data/datasets/lorem_ipsum_long.pbin -------------------------------------------------------------------------------- /tests/dataloader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/dataloader/__init__.py -------------------------------------------------------------------------------- /tests/dataloader/distributed/dist_dataloader_config_with_shuffling.yaml: -------------------------------------------------------------------------------- 1 | train_dataset: 2 | component_key: dataset 3 | variant_key: test 4 | config: 5 | num_samples: 8 6 | 7 | train_dataloader: 8 | component_key: data_loader 9 | variant_key: default 10 | config: 11 | num_workers: 2 12 | pin_memory: true 13 | dataloader_tag: train 14 | dataset: 15 | 
instance_key: train_dataset 16 | pass_type: BY_REFERENCE 17 | batch_sampler: 18 | component_key: batch_sampler 19 | variant_key: default 20 | config: 21 | batch_size: 2 22 | drop_last: true 23 | sampler: 24 | component_key: sampler 25 | variant_key: resumable_distributed_sampler 26 | config: 27 | rank: ${cuda_env:RANK} 28 | num_replicas: ${cuda_env:WORLD_SIZE} 29 | shuffle: true 30 | seed: 0 31 | skip_num_global_samples: 0 32 | dataset: 33 | instance_key: train_dataset 34 | pass_type: BY_REFERENCE -------------------------------------------------------------------------------- /tests/dataloader/distributed/dist_dataloader_config_with_shuffling_and_skipped_batches.yaml: -------------------------------------------------------------------------------- 1 | train_dataset: 2 | component_key: dataset 3 | variant_key: test 4 | config: 5 | num_samples: 8 6 | 7 | train_dataloader: 8 | component_key: data_loader 9 | variant_key: default 10 | config: 11 | num_workers: 2 12 | pin_memory: true 13 | dataloader_tag: train 14 | dataset: 15 | instance_key: train_dataset 16 | pass_type: BY_REFERENCE 17 | batch_sampler: 18 | component_key: batch_sampler 19 | variant_key: default 20 | config: 21 | batch_size: 2 22 | drop_last: true 23 | sampler: 24 | component_key: sampler 25 | variant_key: resumable_distributed_sampler 26 | config: 27 | rank: ${cuda_env:RANK} 28 | num_replicas: ${cuda_env:WORLD_SIZE} 29 | shuffle: true 30 | seed: 0 31 | skip_num_global_samples: 4 # num_batches (1) * world_size (2) * local_micro_batch_size (2) 32 | dataset: 33 | instance_key: train_dataset 34 | pass_type: BY_REFERENCE -------------------------------------------------------------------------------- /tests/dataloader/distributed/dist_dataloader_config_without_shuffling.yaml: -------------------------------------------------------------------------------- 1 | train_dataset: 2 | component_key: dataset 3 | variant_key: test 4 | config: 5 | num_samples: 8 6 | 7 | train_dataloader: 8 | component_key: data_loader 9 | variant_key: default 10 | config: 11 | num_workers: 2 12 | pin_memory: true 13 | dataloader_tag: train 14 | dataset: 15 | instance_key: train_dataset 16 | pass_type: BY_REFERENCE 17 | batch_sampler: 18 | component_key: batch_sampler 19 | variant_key: default 20 | config: 21 | batch_size: 2 22 | drop_last: true 23 | sampler: 24 | component_key: sampler 25 | variant_key: resumable_distributed_sampler 26 | config: 27 | rank: ${cuda_env:RANK} 28 | num_replicas: ${cuda_env:WORLD_SIZE} 29 | shuffle: false 30 | skip_num_global_samples: 0 31 | dataset: 32 | instance_key: train_dataset 33 | pass_type: BY_REFERENCE -------------------------------------------------------------------------------- /tests/dataloader/dummy_sequential_dataset.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from torch.utils.data.dataset import Dataset as TorchdataSet 3 | 4 | 5 | class TestDataset(TorchdataSet): 6 | def __init__(self, num_samples: int): 7 | self.samples = list(range(num_samples)) 8 | 9 | def __len__(self) -> int: 10 | return len(self.samples) 11 | 12 | def __getitem__(self, idx: int) -> dict: 13 | return self.samples[idx] 14 | 15 | 16 | class TestDatasetConfig(BaseModel): 17 | num_samples: int 18 | -------------------------------------------------------------------------------- /tests/dataloader/preprocessing/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/dataloader/preprocessing/__init__.py -------------------------------------------------------------------------------- /tests/dataloader/preprocessing/chunking/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/dataloader/preprocessing/chunking/__init__.py -------------------------------------------------------------------------------- /tests/dataloader/preprocessing/tokenization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/dataloader/preprocessing/tokenization/__init__.py -------------------------------------------------------------------------------- /tests/dataloader/samplers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/dataloader/samplers/__init__.py -------------------------------------------------------------------------------- /tests/dataloader/samplers/test_sequential_samplers.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from torch.utils.data import Dataset, SequentialSampler 3 | 4 | 5 | class DummyDataset(Dataset): 6 | def __init__(self, num_samples): 7 | self.data = list(range(num_samples)) 8 | 9 | def __len__(self): 10 | return len(self.data) 11 | 12 | def __getitem__(self, index): 13 | return self.data[index] 14 | 15 | 16 | @pytest.mark.parametrize( 17 | "num_samples, world_size", 18 | [ 19 | (10, 3), 20 | (15, 4), 21 | ], 22 | ) 23 | def test_distributed_setting(num_samples, world_size): 24 | dataset = DummyDataset(num_samples) 25 | samplers = [SequentialSampler(dataset) for _ in range(world_size)] 26 | 27 | expected_indices = list(range(num_samples)) 28 | # Ensures that all ranks receive the exact same samples in the same order 29 | assert all(list(sampler) == expected_indices for sampler in samplers) 30 | -------------------------------------------------------------------------------- /tests/dataloader/test_combined_dataset.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from modalities.dataloader.dataset import CombinedDataset 4 | 5 | 6 | @pytest.fixture 7 | def dummy_dataset_1() -> list[int]: 8 | return list(range(10)) 9 | 10 | 11 | @pytest.fixture 12 | def dummy_dataset_2() -> list[int]: 13 | return list(range(10, 15)) 14 | 15 | 16 | def test_combined_dataset(dummy_dataset_1: list[int], dummy_dataset_2: list[int]): 17 | combined_dataset = CombinedDataset(datasets=[dummy_dataset_1, dummy_dataset_2]) 18 | 19 | # check that length is calculated correctly 20 | assert len(combined_dataset) == 15 21 | 22 | # check that the elements are iterated over in order 23 | assert [i for i in combined_dataset] == list(range(15)) 24 | 25 | # check that we throw an error when trying to access an index that is out of bounds 26 | with pytest.raises(IndexError): 27 | combined_dataset[15] 28 | -------------------------------------------------------------------------------- /tests/dataloader/test_dummy_dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from 
modalities.dataloader.dataset import DummyDataset, DummySampleConfig, DummySampleDataType 4 | 5 | 6 | def test_dummy_dataset(): 7 | dataset = DummyDataset( 8 | num_samples=50, 9 | sample_definition=[ 10 | DummySampleConfig(sample_key="input_ids", sample_shape=(512,), sample_type=DummySampleDataType.INT), 11 | DummySampleConfig(sample_key="images", sample_shape=(3, 224, 224), sample_type=DummySampleDataType.FLOAT), 12 | ], 13 | ) 14 | assert len(dataset) == 50 15 | sample = next(iter(dataset)) 16 | assert "input_ids" in sample 17 | assert sample["input_ids"].shape == (512,) 18 | assert sample["input_ids"].dtype == np.int64 19 | assert "images" in sample 20 | assert sample["images"].shape == (3, 224, 224) 21 | assert sample["images"].dtype == np.float64 22 | -------------------------------------------------------------------------------- /tests/dataloader/yaml_configs/skipped_dataloader.yaml: -------------------------------------------------------------------------------- 1 | # NOTE: the settings are not type-checked in the instantiation model (specified within the test), as the settings are not used in the pydantic model. 2 | # Therefore, we can place arbitrary values in the settings field. 3 | 4 | settings: 5 | referencing_keys: 6 | sample_key: input_ids 7 | target_key: target_ids 8 | training: 9 | local_train_micro_batch_size: 2 10 | global_num_seen_tokens: 2048 11 | sequence_length: 256 12 | cuda_env: 13 | global_rank: 0 14 | world_size: 2 15 | 16 | collate_fn: 17 | component_key: collate_fn 18 | variant_key: gpt_2_llm_collator 19 | config: 20 | sample_key: ${settings.referencing_keys.sample_key} 21 | target_key: ${settings.referencing_keys.target_key} 22 | 23 | train_dataset: 24 | component_key: dataset 25 | variant_key: packed_mem_map_dataset_continuous 26 | config: 27 | raw_data_path: ./data/lorem_ipsum.pbin 28 | sequence_length: ${settings.training.sequence_length} 29 | sample_key: ${settings.referencing_keys.sample_key} 30 | 31 | skip_num_samples: 32 | component_key: number_conversion 33 | variant_key: num_samples_from_num_tokens 34 | config: 35 | num_tokens: ${settings.training.global_num_seen_tokens} 36 | sequence_length: ${settings.training.sequence_length} 37 | 38 | train_dataloader: 39 | component_key: data_loader 40 | variant_key: default 41 | config: 42 | num_workers: 2 43 | pin_memory: true 44 | dataloader_tag: train 45 | dataset: 46 | instance_key: train_dataset 47 | pass_type: BY_REFERENCE 48 | batch_sampler: 49 | component_key: batch_sampler 50 | variant_key: default 51 | config: 52 | batch_size: ${settings.training.local_train_micro_batch_size} 53 | drop_last: true 54 | sampler: 55 | component_key: sampler 56 | variant_key: resumable_distributed_sampler 57 | config: 58 | dataset: 59 | instance_key: train_dataset 60 | pass_type: BY_REFERENCE 61 | rank: ${settings.cuda_env.global_rank} 62 | num_replicas: ${settings.cuda_env.world_size} 63 | shuffle: false 64 | drop_last: true 65 | skip_num_global_samples: ${skip_num_samples} 66 | collate_fn: 67 | instance_key: collate_fn 68 | pass_type: BY_REFERENCE 69 | -------------------------------------------------------------------------------- /tests/end2end_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/end2end_tests/__init__.py -------------------------------------------------------------------------------- /tests/end2end_tests/custom_components.py:
-------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any, Optional 3 | 4 | from pydantic import BaseModel 5 | 6 | from modalities.batch import EvaluationResultBatch 7 | from modalities.config.config import ProcessGroupBackendType 8 | from modalities.logging_broker.messages import Message 9 | from modalities.logging_broker.subscriber import MessageSubscriberIF 10 | from modalities.running_env.cuda_env import CudaEnv 11 | 12 | 13 | class SaveAllResultSubscriber(MessageSubscriberIF[EvaluationResultBatch]): 14 | def __init__(self): 15 | self.message_list: list[Message[EvaluationResultBatch]] = [] 16 | 17 | def consume_message(self, message: Message[EvaluationResultBatch]): 18 | """Consumes a message from a message broker.""" 19 | self.message_list.append(message) 20 | 21 | def consume_dict(self, message_dict: dict[str, Any]): 22 | pass 23 | 24 | 25 | class SaveAllResultSubscriberConfig(BaseModel): 26 | pass 27 | 28 | 29 | class MultiProcessingCudaEnv(CudaEnv): 30 | """Context manager to set the CUDA environment for distributed training.""" 31 | 32 | def __init__( 33 | self, 34 | process_group_backend: ProcessGroupBackendType, 35 | global_rank: int, 36 | local_rank: int, 37 | world_size: int, 38 | rdvz_port: int, 39 | ) -> None: 40 | super().__init__(process_group_backend=process_group_backend) 41 | self.global_rank = global_rank 42 | self.local_rank = local_rank 43 | self.world_size = world_size 44 | self.rdvz_port = rdvz_port 45 | self._original_env: dict[str, Optional[str]] = {} 46 | 47 | def __enter__(self): 48 | # Store original values 49 | for key in ["MASTER_ADDR", "MASTER_PORT", "RANK", "LOCAL_RANK", "WORLD_SIZE"]: 50 | self._original_env[key] = os.environ.get(key) 51 | 52 | # Set new environment variables 53 | os.environ["MASTER_ADDR"] = "localhost" 54 | os.environ["MASTER_PORT"] = str(self.rdvz_port) 55 | os.environ["RANK"] = str(self.global_rank) 56 | os.environ["LOCAL_RANK"] = str(self.local_rank) 57 | os.environ["WORLD_SIZE"] = str(self.world_size) 58 | 59 | # Initialize CUDA environment 60 | super().__enter__() 61 | return self 62 | 63 | def __exit__(self, exc_type, exc_val, exc_tb): 64 | # Restore original environment variables 65 | for key, value in self._original_env.items(): 66 | if value is None: 67 | os.environ.pop(key, None) 68 | else: 69 | os.environ[key] = value 70 | super().__exit__(exc_type, exc_val, exc_tb) 71 | -------------------------------------------------------------------------------- /tests/end2end_tests/lorem_ipsum.pbin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/end2end_tests/lorem_ipsum.pbin -------------------------------------------------------------------------------- /tests/end2end_tests/system_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/end2end_tests/system_tests/__init__.py -------------------------------------------------------------------------------- /tests/end2end_tests/test_shuffle_jsonl_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tempfile 3 | from pathlib import Path 4 | 5 | import pytest 6 | 7 | from modalities.api import FileExistencePolicy, shuffle_jsonl_data 8 | 9 | 10 | @pytest.fixture 11 | def data_rows() -> 
list[dict]: 12 | return [ 13 | {"file_id": "file_0.jsonl", "doc_id": "0"}, 14 | {"file_id": "file_0.jsonl", "doc_id": "1"}, 15 | {"file_id": "file_0.jsonl", "doc_id": "2"}, 16 | {"file_id": "file_0.jsonl", "doc_id": "3"}, 17 | {"file_id": "file_0.jsonl", "doc_id": "4"}, 18 | {"file_id": "file_0.jsonl", "doc_id": "5"}, 19 | {"file_id": "file_0.jsonl", "doc_id": "6"}, 20 | {"file_id": "file_0.jsonl", "doc_id": "7"}, 21 | {"file_id": "file_0.jsonl", "doc_id": "8"}, 22 | ] 23 | 24 | 25 | @pytest.fixture 26 | def input_data_path(data_rows: list[dict], tmp_path) -> Path: 27 | with open(tmp_path / "input.jsonl", "w", encoding="utf-8") as f: 28 | for row in data_rows: 29 | json.dump(row, f) 30 | f.write("\n") 31 | f.flush() 32 | return Path(f.name) 33 | 34 | 35 | @pytest.mark.parametrize( 36 | "output_data_folder_path, file_existence_policy, seed", 37 | [ 38 | (Path(tempfile.mkdtemp()), FileExistencePolicy.ERROR, 42), 39 | ], 40 | ) 41 | def test_shuffle_jsonl_data( 42 | data_rows: list[dict], 43 | input_data_path: Path, 44 | output_data_folder_path: Path, 45 | file_existence_policy: FileExistencePolicy, 46 | seed: int, 47 | ): 48 | output_data_path = output_data_folder_path / "output.jsonl" 49 | shuffle_jsonl_data( 50 | input_data_path=input_data_path, 51 | output_data_path=output_data_path, 52 | file_existence_policy=file_existence_policy, 53 | seed=seed, 54 | ) 55 | 56 | with output_data_path.open("r", encoding="utf-8") as f: 57 | lines = f.readlines() 58 | rows_dict_shuffled = [json.loads(line) for line in lines] 59 | 60 | # Check that the shuffled data contains the same rows as the input data 61 | assert len(data_rows) > 0 62 | assert len(data_rows) == len(rows_dict_shuffled) 63 | assert any([row != row_shuffled for row, row_shuffled in zip(data_rows, rows_dict_shuffled)]) 64 | assert set([json.dumps(d) for d in data_rows]) == set([json.dumps(d) for d in rows_dict_shuffled]) 65 | -------------------------------------------------------------------------------- /tests/end2end_tests/test_shuffle_tokenized_data.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import tempfile 3 | from pathlib import Path 4 | 5 | import pytest 6 | 7 | from modalities.api import FileExistencePolicy, shuffle_tokenized_data 8 | from modalities.dataloader.dataset import PackedMemMapDatasetBase 9 | 10 | 11 | def _calculate_md5(file_path: Path): 12 | hash_md5 = hashlib.md5() 13 | with open(file_path, "rb") as f: 14 | for chunk in iter(lambda: f.read(4096), b""): 15 | hash_md5.update(chunk) 16 | return hash_md5.hexdigest() 17 | 18 | 19 | @pytest.mark.parametrize( 20 | "tokenized_data_file_path, batch_size", 21 | [ 22 | (Path("tests/end2end_tests/lorem_ipsum.pbin"), 2), 23 | ], 24 | ) 25 | def test_shuffle_tokenized_data(tokenized_data_file_path: Path, batch_size: int): 26 | # temporary file 27 | md5sums = [] 28 | seeds = [1, 1, 2] 29 | file_paths = [] 30 | datasets = [] 31 | with tempfile.TemporaryDirectory() as temp_dir: 32 | for i in range(3): 33 | temp_file = Path(temp_dir) / f"shuffled_data_{i}.pbin" 34 | file_paths.append(temp_file) 35 | shuffle_tokenized_data( 36 | tokenized_data_file_path, 37 | output_data_path=temp_file, 38 | batch_size=batch_size, 39 | file_existence_policy=FileExistencePolicy.OVERRIDE, 40 | seed=seeds[i], 41 | ) 42 | md5sums.append(_calculate_md5(temp_file)) 43 | datasets.append(PackedMemMapDatasetBase(raw_data_path=temp_file, sample_key="text", load_index=True)) 44 | 45 | # check that the different seeds lead to different orderings 46 | 
# and that the same seed leads to the same ordering 47 | assert md5sums[0] == md5sums[1] 48 | assert md5sums[0] != md5sums[2] 49 | 50 | assert len(datasets[0]) == len(datasets[1]) == len(datasets[2]) 51 | for i in range(len(datasets[0])): 52 | assert all(datasets[0][i]["text"] == datasets[1][i]["text"]) 53 | 54 | # when we shuffle some lines might end up in the same place 55 | # in this test we make sure that at least one line is at a different place 56 | num_differing_lines = 0 57 | for i in range(len(datasets[0])): 58 | if len(datasets[0][i]["text"]) == len(datasets[2][i]["text"]): 59 | num_differing_lines += int(any(datasets[0][i]["text"] != datasets[2][i]["text"])) 60 | else: 61 | num_differing_lines += 1 62 | assert num_differing_lines > 0 63 | -------------------------------------------------------------------------------- /tests/fsdp2_parallelization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/fsdp2_parallelization/__init__.py -------------------------------------------------------------------------------- /tests/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/models/__init__.py -------------------------------------------------------------------------------- /tests/models/coca/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/models/coca/__init__.py -------------------------------------------------------------------------------- /tests/models/coca/coca_config.yaml: -------------------------------------------------------------------------------- 1 | prediction_key: logits 2 | vision_embd_prediction_key: vision_embeddings 3 | text_embd_prediction_key: text_embeddings 4 | vision_cls_prediction_key: vision_cls 5 | text_cls_prediction_key: text_cls 6 | vision_encoder_config: 7 | sample_key: images 8 | prediction_key: vision_embeddings 9 | img_size: 224 10 | n_classes: Null # Disable vision transformer head 11 | n_layer: 6 12 | attention_config: 13 | attention_engine_type: pytorch_flash_attention 14 | n_head: 8 15 | n_embd: 768 16 | dropout: 0.0 17 | patch_size: 16 18 | patch_stride: 16 19 | n_img_channels: 3 20 | add_cls_token: False 21 | bias: True 22 | text_decoder_config: 23 | sample_key: input_ids 24 | prediction_key: text_embeddings 25 | block_size: 1024 26 | vocab_size: 50304 27 | n_layer_text: 6 28 | n_layer_multimodal_text: 6 29 | attention_config: 30 | attention_engine_type: pytorch_flash_attention 31 | n_head: 12 32 | ffn_hidden: 3072 33 | n_embd: 768 34 | dropout: 0.0 35 | bias: true 36 | activation: swiglu 37 | epsilon: 1e-5 38 | n_pool_head: 8 39 | n_vision_queries: 256 40 | bias_attn_pool: False 41 | epsilon_attn_pool: 1e-5 -------------------------------------------------------------------------------- /tests/models/coca/test_attention_pooling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from modalities.models.coca.attention_pooling import AttentionPooling 4 | 5 | 6 | def test_attention_pooling_forward(): 7 | model = AttentionPooling(n_embd=768, n_head=8, bias=False, epsilon=1e-5) 8 | dummy_input = torch.randn(1, 256, 768) 9 | dummy_queries = torch.randn(1, 257, 
768) 10 | out = model(dummy_queries, dummy_input) 11 | assert out.shape == (1, 257, 768) 12 | -------------------------------------------------------------------------------- /tests/models/components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/models/components/__init__.py -------------------------------------------------------------------------------- /tests/models/components/test_layer_norms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | import torch 4 | import torch.nn as nn 5 | 6 | from modalities.models.components.layer_norms import RMSLayerNorm 7 | 8 | 9 | @pytest.fixture 10 | def rms_layer_norm() -> RMSLayerNorm: 11 | norm = RMSLayerNorm(ndim=3, epsilon=1e-6) 12 | weight_tensor = torch.Tensor([1, 2, 3]) 13 | norm.weight = nn.Parameter(weight_tensor) 14 | norm.bias = nn.Parameter(torch.ones(3)) 15 | return norm 16 | 17 | 18 | def test_rms_layer_norm_forward(rms_layer_norm): 19 | x = torch.Tensor([0.1, 0.2, 0.3]) 20 | output = rms_layer_norm(x) 21 | ref_x = x / np.sqrt((0.1**2 + 0.2**2 + 0.3**2) / 3 + 1e-6) 22 | ref_tensor = ref_x * rms_layer_norm.weight + torch.tensor([1, 1, 1]) 23 | 24 | assert output.shape == x.shape 25 | assert all(output == ref_tensor) 26 | -------------------------------------------------------------------------------- /tests/models/test_hf_adapter.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from modalities.models.huggingface_adapters.hf_adapter import HFModelAdapterConfig 6 | 7 | 8 | @pytest.fixture() 9 | def hf_model_adapter_config() -> HFModelAdapterConfig: 10 | return HFModelAdapterConfig(config={}) 11 | 12 | 13 | def test_convert_posixpath_to_str(hf_model_adapter_config: HFModelAdapterConfig): 14 | test_data_to_be_formatted = { 15 | "key1": Path("test/path/1"), 16 | "key2": [ 17 | {"key211": Path("test/path/211"), "key212": 1}, 18 | {"key221": 1, "key222": Path("test/path/222")}, 19 | ], 20 | "key3": 1, 21 | } 22 | expected_result = { 23 | "key1": "test/path/1", 24 | "key2": [ 25 | {"key211": "test/path/211", "key212": 1}, 26 | {"key221": 1, "key222": "test/path/222"}, 27 | ], 28 | "key3": 1, 29 | } 30 | result = hf_model_adapter_config._convert_posixpath_to_str(test_data_to_be_formatted) 31 | assert result == expected_result 32 | -------------------------------------------------------------------------------- /tests/models/test_model_factory.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn as nn 4 | 5 | from modalities.exceptions import ModelStateError 6 | from modalities.models.model_factory import ModelFactory 7 | 8 | 9 | class AllMetaDeviceModel(nn.Module): 10 | def __init__(self): 11 | super().__init__() 12 | self.linear = nn.Linear(4, 2, device="meta") 13 | self.register_buffer("buffer", torch.empty(1, device="meta")) 14 | 15 | 16 | class AllRealDeviceModel(nn.Module): 17 | def __init__(self): 18 | super().__init__() 19 | self.linear = nn.Linear(4, 2) 20 | self.register_buffer("buffer", torch.empty(1)) 21 | 22 | 23 | class MixedDeviceModel(nn.Module): 24 | def __init__(self): 25 | super().__init__() 26 | self.linear = nn.Linear(4, 2, device="meta") 27 | self.register_buffer("buffer", torch.empty(1)) # Not on meta device 28 | 29 | 30 
| def test_is_model_on_meta_device_true(): 31 | model = AllMetaDeviceModel() 32 | assert ModelFactory._is_model_on_meta_device(model) 33 | 34 | 35 | def test_is_model_on_meta_device_false(): 36 | model = AllRealDeviceModel() 37 | assert not ModelFactory._is_model_on_meta_device(model) 38 | 39 | 40 | def test_is_model_on_meta_device_mixed_raises(): 41 | model = MixedDeviceModel() 42 | with pytest.raises(ModelStateError): 43 | ModelFactory._is_model_on_meta_device(model) 44 | -------------------------------------------------------------------------------- /tests/models/vision_transformer/test_vision_transformer.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | import torch 5 | 6 | from modalities.__main__ import load_app_config_dict 7 | from modalities.models.vision_transformer.vision_transformer_model import VisionTransformer, VisionTransformerConfig 8 | from tests.conftest import _ROOT_DIR 9 | 10 | 11 | def test_vision_transformer(): 12 | # Create model 13 | config_file_path = _ROOT_DIR / Path("tests/models/vision_transformer/vision_transformer_config.yaml") 14 | config_dict = load_app_config_dict(config_file_path=config_file_path) 15 | config = VisionTransformerConfig.model_validate(config_dict) 16 | model = VisionTransformer(**dict(config)) 17 | 18 | # Create dummy inputs 19 | dummy_input_image = torch.randn(1, 3, 224, 224) 20 | dummy_input = dict(images=dummy_input_image) 21 | 22 | # Create optimizer 23 | optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9) 24 | 25 | # Run one training step 26 | optimizer.zero_grad() 27 | out = model(dummy_input) 28 | loss = out["logits"].sum() 29 | loss.backward() 30 | optimizer.step() 31 | 32 | # Test outputs 33 | assert "logits" in out 34 | assert out["logits"].shape == (1, 1000) 35 | 36 | 37 | @pytest.mark.parametrize( 38 | "img_size,patch_size,patch_stride,add_cls_token,target_block_size", 39 | [ 40 | ((224, 224), 16, 16, True, 197), 41 | ((224, 224), 16, 16, False, 196), 42 | ((224, 112), 16, 16, False, 98), 43 | ((480, 480), 16, 16, False, 900), 44 | ((480 + 1, 480 + 1), 16, 16, False, 900), 45 | ((224, 224), 8, 16, True, 197), 46 | ((224, 224), 16, 8, True, 730), 47 | ((224, 224), 8, 8, True, 785), 48 | ], 49 | ) 50 | def test_vision_transformer_block_size(img_size, patch_size, patch_stride, add_cls_token, target_block_size): 51 | block_size = VisionTransformer._calculate_block_size(img_size, patch_size, patch_stride, add_cls_token) 52 | assert block_size == target_block_size 53 | -------------------------------------------------------------------------------- /tests/models/vision_transformer/vision_transformer_config.yaml: -------------------------------------------------------------------------------- 1 | sample_key: images 2 | prediction_key: logits 3 | img_size: 224 4 | n_classes: 1000 5 | n_layer: 6 6 | n_head: 8 7 | n_embd: 768 8 | dropout: 0.0 9 | patch_size: 16 10 | patch_stride: 16 11 | n_img_channels: 3 12 | add_cls_token: True 13 | bias: True 14 | -------------------------------------------------------------------------------- /tests/nn/test_attention.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from modalities.nn.attention import AttentionType, MultiHeadAttention 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "attention_type", [AttentionType.CAUSAL_SELF_ATTENTION, AttentionType.NON_CAUSAL_SELF_ATTENTION] 9 | ) 10 | def 
test_attention_forward(attention_type): 11 | model = MultiHeadAttention(n_embd=64, n_head=8, attention_type=attention_type) 12 | dummy_input = torch.randn(1, 256, 64) 13 | out = model(dummy_input) 14 | assert out.shape == (1, 256, 64) 15 | 16 | 17 | def test_attention_with_cross_attention_forward(): 18 | model = MultiHeadAttention(n_embd=64, n_head=8, attention_type=AttentionType.CROSS_ATTENTION) 19 | dummy_input = torch.randn(1, 256, 64) 20 | dummy_context = torch.randn(1, 16, 64) 21 | out = model(dummy_input, context=dummy_context) 22 | assert out.shape == (1, 256, 64) 23 | -------------------------------------------------------------------------------- /tests/nn/test_mlp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from modalities.models.model import SwiGLU 5 | from modalities.nn.mlp import MLP 6 | 7 | 8 | def test_mlp_forward(): 9 | model = MLP(in_features=64, hidden_features=256) 10 | dummy_input = torch.randn(1, 10, 64) 11 | out = model(dummy_input) 12 | assert out.shape == (1, 10, 64) 13 | 14 | 15 | def test_SwiGLU_forward(): 16 | n_embd = 512 17 | ffn_hidden = 4 * n_embd 18 | bias = True 19 | mlp = SwiGLU(n_embd=n_embd, ffn_hidden=ffn_hidden, bias=bias) 20 | 21 | hidden_dim = 1536 22 | assert SwiGLU._get_hidden_dim(ffn_hidden=ffn_hidden) == hidden_dim 23 | 24 | n_embd = 511 25 | ffn_hidden = 4 * n_embd 26 | assert SwiGLU._get_hidden_dim(ffn_hidden=ffn_hidden) == hidden_dim 27 | 28 | n_embd = 512 29 | 30 | # batch size x sequence length x embedding dim 31 | input_tensor = torch.randn(1, 1, n_embd) 32 | output_tensor = mlp(input_tensor) 33 | assert output_tensor.shape == (1, 1, n_embd) 34 | 35 | W = nn.Linear(in_features=n_embd, out_features=hidden_dim, bias=bias) 36 | V = nn.Linear(in_features=n_embd, out_features=hidden_dim, bias=bias) 37 | W_2 = nn.Linear(in_features=hidden_dim, out_features=n_embd, bias=bias) 38 | silu = nn.SiLU() 39 | mlp.W = W 40 | mlp.V = V 41 | mlp.W_2 = W_2 42 | 43 | output_tensor = mlp(input_tensor) 44 | assert torch.all(output_tensor == W_2(silu(W(input_tensor)) * V(input_tensor))) 45 | -------------------------------------------------------------------------------- /tests/run_all_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ####################### 4 | ### INPUT ARGUMENTS ### 5 | ####################### 6 | if [ -z "$1" ] || [ -z "$2" ] # if one of the two input arguments does not exist 7 | then 8 | echo "Need to specify 2 GPU devices as arguments, e.g. bash run_all_tests.sh 0 1" 9 | exit 10 | fi 11 | if [[ $1 =~ [^0-7] ]] || [[ $2 =~ [^0-7] ]] # if one of the two input arguments contains a character other than 0-7 12 | then 13 | echo "Need to specify integers 0-7 as arguments, e.g. bash run_all_tests.sh 0 1" 14 | exit 15 | fi 16 | 17 | ################# 18 | ### VARIABLES ### 19 | ################# 20 | DEV0=$1 21 | DEV1=$2 22 | 23 | ############# 24 | ### TESTS ### 25 | ############# 26 | 27 | SCRIPT_DIR=$(dirname "$(readlink -f "$0")") 28 | cd "$SCRIPT_DIR/.."
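# Editorial note on the coverage setup below (added for clarity): the single-process
# pytest run writes its own partial coverage data file into .coverage_reports/ via the
# COVERAGE_FILE environment variable, the distributed suite is invoked with --cov and
# presumably contributes further .coverage.part* files, and the partial reports are
# merged at the end with `coverage combine`.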
29 | 30 | 31 | mkdir -p .coverage_reports 32 | rm -f .coverage_reports/* 33 | 34 | COVERAGE_FILE=.coverage_reports/.coverage.part0 python -m pytest tests/ 35 | 36 | sh tests/run_distributed_tests.sh $DEV0 $DEV1 --cov 37 | 38 | # combine test coverage reports 39 | cd .coverage_reports 40 | coverage combine --keep 41 | coverage report 42 | -------------------------------------------------------------------------------- /tests/test_evaluator.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import call 2 | 3 | import torch 4 | 5 | from modalities.batch import DatasetBatch 6 | from modalities.evaluator import Evaluator 7 | 8 | 9 | def test_evaluate_cpu( 10 | monkeypatch, nn_model_mock, loss_mock, llm_data_loader_mock, progress_publisher_mock, set_env_cpu 11 | ): 12 | batch_size = 32 13 | seq_len = 64 14 | num_batches = 4 15 | sample_key = "input_ids" 16 | target_key = "target_ids" 17 | 18 | sample_tensor = torch.randint(size=(batch_size, seq_len), low=1, high=100) 19 | samples = {sample_key: sample_tensor[:, :-1]} 20 | targets = {target_key: sample_tensor[:, 1:]} 21 | 22 | batches = [DatasetBatch(targets=targets, samples=samples) for _ in range(num_batches)] 23 | 24 | llm_data_loader_mock.__iter__ = lambda _: iter(batches) 25 | llm_data_loader_mock.batch_size = batch_size 26 | 27 | evaluator = Evaluator( 28 | progress_publisher=progress_publisher_mock, 29 | evaluation_result_publisher=progress_publisher_mock, 30 | ) 31 | 32 | evaluator.evaluate( 33 | model=nn_model_mock, data_loaders=[llm_data_loader_mock], loss_fun=loss_mock, num_train_steps_done=1 34 | ) 35 | nn_model_mock.assert_has_calls([call(b.samples) for b in batches]) 36 | -------------------------------------------------------------------------------- /tests/test_gym.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import call 2 | 3 | from modalities.gym import Gym 4 | from tests.test_utils import configure_dataloader_mock 5 | 6 | 7 | def test_run_cpu_only( 8 | monkeypatch, 9 | checkpoint_saving_mock, 10 | evaluator_mock, 11 | app_state_mock, 12 | loss_mock, 13 | llm_data_loader_mock, 14 | set_env_cpu, 15 | trainer, 16 | ): 17 | num_batches = 4 18 | num_ranks = 1 19 | 20 | llm_data_loader_mock, batches = configure_dataloader_mock( 21 | batch_size=32, 22 | seq_len=64, 23 | num_batches=num_batches, 24 | sample_key="input_ids", 25 | target_key="target_ids", 26 | llm_data_loader_mock=llm_data_loader_mock, 27 | ) 28 | 29 | gym = Gym(trainer=trainer, evaluator=evaluator_mock, loss_fun=loss_mock, num_ranks=num_ranks) 30 | gym.run( 31 | app_state=app_state_mock, 32 | training_log_interval_in_steps=1, 33 | checkpointing_interval_in_steps=1, 34 | evaluation_interval_in_steps=1, 35 | train_data_loader=llm_data_loader_mock, 36 | evaluation_data_loaders=[], 37 | checkpoint_saving=checkpoint_saving_mock, 38 | ) 39 | app_state_mock.model.assert_has_calls([call(b.samples) for b in batches]) 40 | app_state_mock.optimizer.step.assert_called() 41 | -------------------------------------------------------------------------------- /tests/test_loss_functions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from modalities.batch import InferenceResultBatch 5 | from modalities.loss_functions import NCELoss, nce_loss 6 | 7 | 8 | @pytest.fixture 9 | def dummy_result_batch() -> InferenceResultBatch: 10 | predictions = {"embedding": torch.rand(1024, 512)} 11 | targets = 
{"target": torch.zeros(1024, 512)} 12 | batch_dim = 1024 13 | result_batch = InferenceResultBatch(targets, predictions, batch_dim) 14 | return result_batch 15 | 16 | 17 | # calculating asymmetric NCELoss between a batch of embeddings and itself --> zero 18 | @pytest.mark.parametrize("key", ["embedding"]) 19 | def test_asymm_NCELoss_is_zero(dummy_result_batch, key): 20 | loss_func = NCELoss(prediction_key1=key, prediction_key2=key) 21 | assert loss_func(dummy_result_batch) <= 10e-6 22 | 23 | 24 | # calculating nce_loss for two randomly generated batch of embeddings (manually calculated) 25 | @pytest.mark.parametrize( 26 | "embedding1,embedding2", 27 | [ 28 | ( 29 | torch.Tensor([[0.38, 0.18], [0.36, 0.66], [0.72, 0.09]]), 30 | torch.Tensor([[0.48, 0.01], [0.54, 0.28], [0.08, 0.34]]), 31 | ) 32 | ], 33 | ) 34 | def test_nce_loss_correctness(embedding1, embedding2): 35 | unidirectional_loss = nce_loss(embedding1, embedding2, device="cpu", is_asymmetric=True, temperature=1.0) 36 | bidirectional_loss = nce_loss(embedding1, embedding2, device="cpu", is_asymmetric=False, temperature=1.0) 37 | assert unidirectional_loss == pytest.approx(1.1300, 0.0001) 38 | assert bidirectional_loss == pytest.approx(2.2577, 0.0001) 39 | -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch.cuda 3 | 4 | from modalities.__main__ import Main 5 | from modalities.config.config import ProcessGroupBackendType 6 | from modalities.config.instantiation_models import TrainingComponentsInstantiationModel 7 | from modalities.running_env.cuda_env import CudaEnv 8 | 9 | 10 | @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="This e2e test requires 1 GPU.") 11 | def test_e2e_training_run_wout_ckpt(monkeypatch, dummy_config, dummy_config_path): 12 | # patch in env variables 13 | monkeypatch.setenv("MASTER_ADDR", "localhost") 14 | monkeypatch.setenv("MASTER_PORT", "9948") 15 | 16 | with CudaEnv(process_group_backend=ProcessGroupBackendType.nccl): 17 | main = Main(dummy_config_path) 18 | main.config_dict = dummy_config 19 | components = main.build_components(components_model_type=TrainingComponentsInstantiationModel) 20 | main.run(components) 21 | -------------------------------------------------------------------------------- /tests/test_rotary_qkv_transform.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from modalities.models.gpt2.gpt2_model import RotaryTransform 4 | 5 | 6 | def test_rotary_transform(): 7 | bs = 1 8 | n_heads = 2 9 | embedding_dim = 8 10 | seq_lenght = 2 11 | head_dim = embedding_dim // n_heads 12 | 13 | q = torch.ones(bs, n_heads, seq_lenght, head_dim) + 1 14 | q[:, :, :, head_dim // 2 :] = q[:, :, :, head_dim // 2 :] + 1 15 | k = torch.ones(bs, n_heads, seq_lenght, head_dim) + 2 16 | k[:, :, :, head_dim // 2 :] = k[:, :, :, head_dim // 2 :] + 1 17 | v = torch.ones(bs, n_heads, seq_lenght, head_dim) 18 | 19 | rotary_transform = RotaryTransform(n_embd=embedding_dim, n_head=n_heads) 20 | 21 | q_rot, k_rot, v_rot = rotary_transform(q=q, k=k, v=v) 22 | 23 | assert torch.equal(v, v_rot) 24 | assert v.shape == v_rot.shape 25 | 26 | theta = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim)) 27 | 28 | m = torch.tensor([0, 1]).view(2, 1) 29 | theta_0 = theta[0] 30 | theta_1 = theta[1] 31 | theta = torch.tensor([theta_0, theta_1, theta_0, theta_1]).view(1, 4) 32 | m_theta = m 
* theta 33 | 34 | cos_m_theta = m_theta.cos() 35 | sin_m_theta = m_theta.sin() 36 | 37 | for comp, comp_rot in zip([q, k], [q_rot, k_rot]): 38 | assert not torch.equal(comp, comp_rot) 39 | assert comp.shape == comp_rot.shape 40 | comp_h_1, comp_h_2 = comp.chunk(2, dim=-1) 41 | comp_rot_h = torch.cat([-comp_h_2, comp_h_1], dim=-1) 42 | comp_rot_expected = comp * cos_m_theta + comp_rot_h * sin_m_theta 43 | assert torch.equal(comp_rot_expected, comp_rot) 44 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import debugpy 4 | import torch 5 | 6 | from modalities.batch import DatasetBatch 7 | from modalities.util import get_local_number_of_trainable_parameters 8 | 9 | 10 | def add_debugger_to_distributed_test(): 11 | """Add a debugger to a distributed test. 12 | This function should be called at the beginning of the test. 13 | 14 | Within VScode you can use the following configuration to attach the debugger to the test: 15 | 16 | ```json 17 | { 18 | "name": "Test Torch Distributed", 19 | "type": "python", 20 | "request": "launch", 21 | "program": "path/to/torchrun", 22 | "console": "integratedTerminal", 23 | "env": {"CUDA_VISIBLE_DEVICES": "0,1"}, 24 | "args": ["--rdzv-endpoint", "localhost:29833", "--nnodes", "1", 25 | "--nproc_per_node", "2", "path/to/pytest", "tests/some_test.py"], 26 | "justMyCode": false, 27 | }, 28 | ``` 29 | """ 30 | # Get the rank of the process (0 or 1 in this case) 31 | rank = int(os.getenv("RANK")) 32 | 33 | # Use a different port for each process 34 | port = 9875 + rank 35 | debugpy.listen(("0.0.0.0", port)) # Listening on all interfaces to allow debugger to attach 36 | print(f"Rank {rank}: Waiting for debugger to attach on port {port}...") 37 | debugpy.wait_for_client() # Pause here until the debugger attaches 38 | 39 | 40 | def configure_dataloader_mock( 41 | batch_size: int, 42 | seq_len: int, 43 | num_batches: int, 44 | sample_key: str, 45 | target_key: str, 46 | llm_data_loader_mock, 47 | ): 48 | sample_tensor = torch.randint(size=(batch_size, seq_len), low=1, high=100) 49 | samples = {sample_key: sample_tensor[:, :-1]} 50 | targets = {target_key: sample_tensor[:, 1:]} 51 | 52 | batches = [DatasetBatch(targets=targets, samples=samples) for _ in range(num_batches)] 53 | 54 | llm_data_loader_mock.__iter__ = lambda _: iter(batches) 55 | llm_data_loader_mock.batch_size = batch_size 56 | llm_data_loader_mock.fast_forward_batch_id = 0 57 | llm_data_loader_mock.__len__ = lambda _: num_batches 58 | 59 | return llm_data_loader_mock, batches 60 | 61 | 62 | def test_get_local_number_of_trainable_parameters(): 63 | # Create a simple model with trainable parameters 64 | model = torch.nn.Sequential(torch.nn.Linear(10, 5), torch.nn.ReLU(), torch.nn.Linear(5, 2)) 65 | 66 | # Calculate the expected number of trainable parameters 67 | expected_params = 10 * 5 + 5 + 5 * 2 + 2 # weights_1 + bias_1 + weights_2 + bias_2 = 67 68 | 69 | # Call the function and check the result 70 | assert get_local_number_of_trainable_parameters(model) == expected_params 71 | -------------------------------------------------------------------------------- /tests/test_yaml_configs/coca_config_initialization.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | component_key: model 3 | variant_key: model_initialized 4 | config: 5 | model: 6 | instance_key: model_raw 7 | pass_type: BY_REFERENCE 8 | 
model_initializer: 9 | component_key: model_initialization 10 | variant_key: composed 11 | config: 12 | model_type: coca 13 | weight_init_type: WILL_BE_REPLACED 14 | mean: 0.0 15 | std: WILL_BE_REPLACED 16 | 17 | model_raw: 18 | component_key: model 19 | variant_key: coca 20 | config: 21 | prediction_key: logits 22 | vision_embd_prediction_key: vision_embeddings 23 | text_embd_prediction_key: text_embeddings 24 | vision_cls_prediction_key: vision_cls 25 | text_cls_prediction_key: text_cls 26 | vision_encoder_config: 27 | sample_key: images 28 | prediction_key: vision_embeddings 29 | img_size: 224 30 | n_classes: Null # Disable vision transformer head 31 | n_layer: 6 32 | attention_config: 33 | attention_engine_type: default_attention 34 | n_head: 8 35 | n_embd: 768 36 | dropout: 0.0 37 | patch_size: 16 38 | patch_stride: 16 39 | n_img_channels: 3 40 | add_cls_token: False 41 | bias: True 42 | text_decoder_config: 43 | sample_key: input_ids 44 | prediction_key: logits 45 | block_size: 1024 46 | vocab_size: 50304 47 | n_layer_text: 6 48 | n_layer_multimodal_text: 6 49 | attention_config: 50 | attention_engine_type: default_attention 51 | n_head: 12 52 | ffn_hidden: 3072 53 | n_embd: 768 54 | dropout: 0.0 55 | bias: true 56 | activation: swiglu 57 | epsilon: 1e-5 58 | n_pool_head: 8 59 | n_vision_queries: 256 60 | bias_attn_pool: False 61 | epsilon_attn_pool: 1e-5 -------------------------------------------------------------------------------- /tests/test_yaml_configs/gpt2_config_initialization.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | component_key: model 3 | variant_key: model_initialized 4 | config: 5 | model: 6 | instance_key: model_raw 7 | pass_type: BY_REFERENCE 8 | model_initializer: 9 | component_key: model_initialization 10 | variant_key: composed 11 | config: 12 | model_type: gpt2 13 | weight_init_type: WILL_BE_REPLACED 14 | mean: 0.0 15 | std: WILL_BE_REPLACED 16 | hidden_dim: ${model_raw.config.n_embd} 17 | num_layers: ${model_raw.config.n_layer} 18 | 19 | model_raw: 20 | component_key: model 21 | variant_key: gpt2 22 | config: 23 | sample_key: "input_ids" 24 | poe_type: ABSOLUTE 25 | prediction_key: "logits" 26 | sequence_length: 2048 27 | vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency 28 | n_layer: 12 29 | n_head_q: 12 30 | n_head_kv: 12 31 | ffn_hidden: 2048 32 | n_embd: 768 33 | dropout: 0.0 34 | bias: true # True: bias in Linears and LayerNorms, like GPT-2. 
False: a bit better and faster 35 | attention_config: 36 | qkv_transforms: [] 37 | attention_implementation: manual 38 | activation_type: gelu 39 | attention_norm_config: 40 | norm_type: rms_norm 41 | config: 42 | ndim: ${model_raw.config.n_embd} 43 | bias: true 44 | epsilon: 1e-5 45 | ffn_norm_config: 46 | norm_type: rms_norm 47 | config: 48 | ndim: ${model_raw.config.n_embd} 49 | bias: true 50 | epsilon: 1e-5 51 | lm_head_norm_config: 52 | norm_type: rms_norm 53 | config: 54 | ndim: ${model_raw.config.n_embd} 55 | bias: true 56 | epsilon: 1e-5 57 | use_weight_tying: WILL_BE_REPLACED -------------------------------------------------------------------------------- /tests/test_yaml_configs/gpt2_config_initialization_fsdp1.yaml: -------------------------------------------------------------------------------- 1 | tested_model: 2 | component_key: model 3 | variant_key: fsdp1_wrapped 4 | config: 5 | model: 6 | instance_key: initialized_model 7 | pass_type: BY_REFERENCE 8 | sync_module_states: true 9 | mixed_precision_settings: BF_16 10 | sharding_strategy: FULL_SHARD 11 | block_names: [GPT2Block] 12 | 13 | initialized_model: 14 | component_key: model 15 | variant_key: model_initialized 16 | config: 17 | model: 18 | instance_key: model_raw 19 | pass_type: BY_REFERENCE 20 | model_initializer: 21 | component_key: model_initialization 22 | variant_key: composed 23 | config: 24 | model_type: gpt2 25 | weight_init_type: WILL_BE_REPLACED 26 | mean: 0.0 27 | std: WILL_BE_REPLACED 28 | hidden_dim: ${model_raw.config.n_embd} 29 | num_layers: ${model_raw.config.n_layer} 30 | 31 | model_raw: 32 | component_key: model 33 | variant_key: gpt2 34 | config: 35 | sample_key: "input_ids" 36 | poe_type: ABSOLUTE 37 | prediction_key: "logits" 38 | sequence_length: 2048 39 | vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency 40 | n_layer: 12 41 | n_head_q: 12 42 | n_head_kv: 12 43 | ffn_hidden: 2048 44 | n_embd: 768 45 | dropout: 0.0 46 | bias: true # True: bias in Linears and LayerNorms, like GPT-2. 
False: a bit better and faster 47 | attention_config: 48 | qkv_transforms: [] 49 | attention_implementation: manual 50 | activation_type: gelu 51 | attention_norm_config: 52 | norm_type: rms_norm 53 | config: 54 | ndim: ${model_raw.config.n_embd} 55 | bias: true 56 | epsilon: 1e-5 57 | ffn_norm_config: 58 | norm_type: rms_norm 59 | config: 60 | ndim: ${model_raw.config.n_embd} 61 | bias: true 62 | epsilon: 1e-5 63 | lm_head_norm_config: 64 | norm_type: rms_norm 65 | config: 66 | ndim: ${model_raw.config.n_embd} 67 | bias: true 68 | epsilon: 1e-5 69 | use_weight_tying: WILL_BE_REPLACED -------------------------------------------------------------------------------- /tests/test_yaml_configs/gpt2_config_initialization_fsdp2.yaml: -------------------------------------------------------------------------------- 1 | tested_model: 2 | component_key: model 3 | variant_key: model_initialized 4 | config: 5 | model: 6 | instance_key: fsdp_model 7 | pass_type: BY_REFERENCE 8 | model_initializer: 9 | component_key: model_initialization 10 | variant_key: composed 11 | config: 12 | model_type: gpt2 13 | weight_init_type: WILL_BE_REPLACED 14 | mean: 0.0 15 | std: WILL_BE_REPLACED 16 | hidden_dim: ${model_raw.config.n_embd} 17 | num_layers: ${model_raw.config.n_layer} 18 | 19 | fsdp_model: 20 | component_key: model 21 | variant_key: fsdp2_wrapped 22 | config: 23 | model: 24 | instance_key: model_raw 25 | pass_type: BY_REFERENCE 26 | device_mesh: 27 | instance_key: device_mesh 28 | pass_type: BY_REFERENCE 29 | mixed_precision_settings: 30 | param_dtype: BF_16 31 | reduce_dtype: BF_16 32 | block_names: [GPT2Block] 33 | 34 | model_raw: 35 | component_key: model 36 | variant_key: gpt2 37 | config: 38 | sample_key: "input_ids" 39 | poe_type: ABSOLUTE 40 | prediction_key: "logits" 41 | sequence_length: 2048 42 | vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency 43 | n_layer: 12 44 | n_head_q: 12 45 | n_head_kv: 12 46 | ffn_hidden: 2048 47 | n_embd: 768 48 | dropout: 0.0 49 | bias: true # True: bias in Linears and LayerNorms, like GPT-2. 
False: a bit better and faster 50 | attention_config: 51 | qkv_transforms: [] 52 | attention_implementation: manual 53 | activation_type: gelu 54 | attention_norm_config: 55 | norm_type: rms_norm 56 | config: 57 | ndim: ${model_raw.config.n_embd} 58 | bias: true 59 | epsilon: 1e-5 60 | ffn_norm_config: 61 | norm_type: rms_norm 62 | config: 63 | ndim: ${model_raw.config.n_embd} 64 | bias: true 65 | epsilon: 1e-5 66 | lm_head_norm_config: 67 | norm_type: rms_norm 68 | config: 69 | ndim: ${model_raw.config.n_embd} 70 | bias: true 71 | epsilon: 1e-5 72 | use_weight_tying: WILL_BE_REPLACED 73 | use_meta_device: false 74 | 75 | device_mesh: 76 | component_key: device_mesh 77 | variant_key: default 78 | config: 79 | device_type: cuda 80 | data_parallel_replicate_degree: 1 81 | data_parallel_shard_degree: ${cuda_env:WORLD_SIZE} 82 | world_size: ${cuda_env:WORLD_SIZE} -------------------------------------------------------------------------------- /tests/test_yaml_configs/gpt2_config_mfu_fsdp1.yaml: -------------------------------------------------------------------------------- 1 | test_model: 2 | component_key: model 3 | variant_key: fsdp1_wrapped 4 | config: 5 | model: 6 | instance_key: model_initialized 7 | pass_type: BY_REFERENCE 8 | sync_module_states: true 9 | mixed_precision_settings: BF_16 10 | sharding_strategy: FULL_SHARD 11 | block_names: [GPT2Block] 12 | 13 | model_initialized: 14 | component_key: model 15 | variant_key: model_initialized 16 | config: 17 | model: 18 | instance_key: model_raw 19 | pass_type: BY_REFERENCE 20 | model_initializer: 21 | component_key: model_initialization 22 | variant_key: composed 23 | config: 24 | model_type: gpt2 25 | weight_init_type: scaled 26 | mean: 0.0 27 | std: 0.02 28 | num_layers: ${model_raw.config.n_layer} 29 | 30 | model_raw: 31 | component_key: model 32 | variant_key: gpt2 33 | config: 34 | sample_key: "input_ids" 35 | poe_type: ABSOLUTE 36 | prediction_key: "logits" 37 | sequence_length: 2048 38 | vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency 39 | n_layer: 12 40 | n_head_q: 12 41 | n_head_kv: 12 42 | ffn_hidden: 3072 43 | n_embd: 768 44 | dropout: 0.0 45 | bias: false # True: bias in Linears and LayerNorms, like GPT-2. 
False: a bit better and faster 46 | attention_config: 47 | qkv_transforms: [] 48 | attention_implementation: manual 49 | activation_type: gelu 50 | attention_norm_config: 51 | norm_type: rms_norm 52 | config: 53 | ndim: ${model_raw.config.n_embd} 54 | bias: false 55 | epsilon: 1e-5 56 | ffn_norm_config: 57 | norm_type: rms_norm 58 | config: 59 | ndim: ${model_raw.config.n_embd} 60 | bias: false 61 | epsilon: 1e-5 62 | lm_head_norm_config: 63 | norm_type: rms_norm 64 | config: 65 | ndim: ${model_raw.config.n_embd} 66 | bias: false 67 | epsilon: 1e-5 68 | use_weight_tying: true 69 | use_meta_device: false 70 | 71 | mfu_calculator: 72 | component_key: mfu_calculator 73 | variant_key: gpt2 74 | config: 75 | n_layer: ${model_raw.config.n_layer} 76 | sequence_length: ${model_raw.config.sequence_length} 77 | n_embd: ${model_raw.config.n_embd} 78 | world_size: ${cuda_env:WORLD_SIZE} 79 | wrapped_model: 80 | instance_key: test_model 81 | pass_type: BY_REFERENCE -------------------------------------------------------------------------------- /tests/test_yaml_configs/gpt2_config_mfu_fsdp2.yaml: -------------------------------------------------------------------------------- 1 | test_model: 2 | component_key: model 3 | variant_key: model_initialized 4 | config: 5 | model: 6 | instance_key: fsdp_model 7 | pass_type: BY_REFERENCE 8 | model_initializer: 9 | component_key: model_initialization 10 | variant_key: composed 11 | config: 12 | model_type: gpt2 13 | weight_init_type: scaled 14 | mean: 0.0 15 | std: 0.02 16 | num_layers: ${model_raw.config.n_layer} 17 | 18 | fsdp_model: 19 | component_key: model 20 | variant_key: fsdp2_wrapped 21 | config: 22 | model: 23 | instance_key: model_raw 24 | pass_type: BY_REFERENCE 25 | device_mesh: 26 | instance_key: device_mesh 27 | pass_type: BY_REFERENCE 28 | mixed_precision_settings: 29 | param_dtype: BF_16 30 | reduce_dtype: BF_16 31 | block_names: [GPT2Block] 32 | 33 | model_raw: 34 | component_key: model 35 | variant_key: gpt2 36 | config: 37 | sample_key: "input_ids" 38 | poe_type: ABSOLUTE 39 | prediction_key: "logits" 40 | sequence_length: 2048 41 | vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency 42 | n_layer: 12 43 | n_head_q: 12 44 | n_head_kv: 12 45 | ffn_hidden: 3072 46 | n_embd: 768 47 | dropout: 0.0 48 | bias: false # True: bias in Linears and LayerNorms, like GPT-2. 
False: a bit better and faster 49 | attention_config: 50 | qkv_transforms: [] 51 | attention_implementation: manual 52 | activation_type: gelu 53 | attention_norm_config: 54 | norm_type: rms_norm 55 | config: 56 | ndim: ${model_raw.config.n_embd} 57 | bias: false 58 | epsilon: 1e-5 59 | ffn_norm_config: 60 | norm_type: rms_norm 61 | config: 62 | ndim: ${model_raw.config.n_embd} 63 | bias: false 64 | epsilon: 1e-5 65 | lm_head_norm_config: 66 | norm_type: rms_norm 67 | config: 68 | ndim: ${model_raw.config.n_embd} 69 | bias: false 70 | epsilon: 1e-5 71 | use_weight_tying: true 72 | use_meta_device: false 73 | 74 | device_mesh: 75 | component_key: device_mesh 76 | variant_key: default 77 | config: 78 | device_type: cuda 79 | data_parallel_replicate_degree: 1 80 | data_parallel_shard_degree: ${cuda_env:WORLD_SIZE} # i.e., fully sharded 81 | world_size: ${cuda_env:WORLD_SIZE} 82 | 83 | mfu_calculator: 84 | component_key: mfu_calculator 85 | variant_key: gpt2 86 | config: 87 | n_layer: ${model_raw.config.n_layer} 88 | sequence_length: ${model_raw.config.sequence_length} 89 | n_embd: ${model_raw.config.n_embd} 90 | world_size: ${cuda_env:WORLD_SIZE} 91 | wrapped_model: 92 | instance_key: test_model 93 | pass_type: BY_REFERENCE -------------------------------------------------------------------------------- /tests/test_yaml_configs/gpt2_config_optimizer.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | component_key: model 3 | variant_key: model_initialized 4 | config: 5 | model: 6 | instance_key: model_raw 7 | pass_type: BY_REFERENCE 8 | model_initializer: 9 | component_key: model_initialization 10 | variant_key: composed 11 | config: 12 | model_type: gpt2 13 | weight_init_type: scaled 14 | mean: 0.0 15 | std: 0.02 16 | num_layers: ${model_raw.config.n_layer} 17 | 18 | model_raw: 19 | component_key: model 20 | variant_key: gpt2 21 | config: 22 | sample_key: "input_ids" 23 | poe_type: ABSOLUTE 24 | prediction_key: "logits" 25 | sequence_length: 2048 26 | vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency 27 | n_layer: 12 28 | n_head_q: 12 29 | n_head_kv: 12 30 | ffn_hidden: 2048 31 | n_embd: 768 32 | dropout: 0.0 33 | bias: true # True: bias in Linears and LayerNorms, like GPT-2. 
False: a bit better and faster 34 | attention_config: 35 | qkv_transforms: [] 36 | attention_implementation: manual 37 | activation_type: gelu 38 | attention_norm_config: 39 | norm_type: rms_norm 40 | config: 41 | ndim: ${model_raw.config.n_embd} 42 | bias: true 43 | epsilon: 1e-5 44 | ffn_norm_config: 45 | norm_type: rms_norm 46 | config: 47 | ndim: ${model_raw.config.n_embd} 48 | bias: true 49 | epsilon: 1e-5 50 | lm_head_norm_config: 51 | norm_type: rms_norm 52 | config: 53 | ndim: ${model_raw.config.n_embd} 54 | bias: true 55 | epsilon: 1e-5 56 | use_weight_tying: true -------------------------------------------------------------------------------- /tests/tmp/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/tmp/.gitkeep -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/utils/__init__.py -------------------------------------------------------------------------------- /tests/utils/test_seeding.py: -------------------------------------------------------------------------------- 1 | from pytest import mark 2 | 3 | from modalities.utils.seeding import calculate_hashed_seed 4 | 5 | 6 | @mark.parametrize( 7 | "input_data, max_seed", 8 | [ 9 | (["a", "b", "c"], 2**32 - 1), 10 | (["d", "e", "f"], 2**32 - 1), 11 | (["g", "hij", "klmnop"], 2**32 - 1), 12 | ( 13 | [ 14 | "5d3b0e03a13dff183d4d77bc258bec18", 15 | "5d3b0e03a13dff183d4d77bc258bec18", 16 | "5d3b0e03a13dff183d4d77bc258bec18", 17 | ], 18 | 2**32 - 1, 19 | ), 20 | ( 21 | [ 22 | "123b0e03a13dff183d4d77bc258bec18", 23 | "456b0e03a13dff183d4d77bc258bec18", 24 | "789b0e03a13dff183d4d77bc258bec18", 25 | ], 26 | 2**32 - 1, 27 | ), 28 | ], 29 | ) 30 | def test_calculate_seed(input_data: list[str], max_seed: int): 31 | seed = calculate_hashed_seed(input_data=input_data, max_seed=max_seed) 32 | print(seed) 33 | assert seed >= 0 34 | assert seed < max_seed 35 | -------------------------------------------------------------------------------- /tutorials/getting_started/checkpoints/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tutorials/getting_started/checkpoints/.gitkeep -------------------------------------------------------------------------------- /tutorials/getting_started/configs/example_conversion_config_template.yaml: -------------------------------------------------------------------------------- 1 | tokenizer: 2 | component_key: tokenizer 3 | variant_key: pretrained_hf_tokenizer 4 | config: 5 | pretrained_model_name_or_path: tokenizer 6 | padding: false 7 | truncation: false 8 | 9 | checkpointed_model: 10 | component_key: model 11 | variant_key: fsdp1_checkpointed 12 | config: 13 | checkpoint_loading: 14 | component_key: checkpoint_loading 15 | variant_key: torch 16 | config: 17 | device: cpu 18 | precision: BF16 19 | model: 20 | instance_key: model 21 | pass_type: BY_REFERENCE 22 | checkpoint_path: -------------------------------------------------------------------------------- /tutorials/getting_started/configs/example_dataset_config_test.yaml: 
-------------------------------------------------------------------------------- 1 | settings: 2 | src_path: data/raw/redpajama_v2_samples_512_test.jsonl 3 | dst_path: data/mem_map/redpajama_v2_samples_512_test.pbin 4 | index_path: data/mem_map/redpajama_v2_samples_512_test.idx 5 | jq_pattern: .raw_content 6 | num_cpus: ${node_env:num_cpus} 7 | eod_token: <|endoftext|> 8 | processing_batch_size: 1000 9 | raw_samples_queue_size: 300 10 | processed_samples_queue_size: 300 11 | 12 | tokenizer: 13 | component_key: tokenizer 14 | variant_key: pretrained_hf_tokenizer 15 | config: 16 | pretrained_model_name_or_path: tokenizer 17 | padding: false 18 | truncation: false 19 | -------------------------------------------------------------------------------- /tutorials/getting_started/configs/example_dataset_config_train.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | src_path: data/raw/redpajama_v2_samples_512_train.jsonl 3 | dst_path: data/mem_map/redpajama_v2_samples_512_train.pbin 4 | index_path: data/mem_map/redpajama_v2_samples_512_train.idx 5 | jq_pattern: .raw_content 6 | num_cpus: ${node_env:num_cpus} 7 | eod_token: <|endoftext|> 8 | processing_batch_size: 1000 9 | raw_samples_queue_size: 300 10 | processed_samples_queue_size: 300 11 | 12 | tokenizer: 13 | component_key: tokenizer 14 | variant_key: pretrained_hf_tokenizer 15 | config: 16 | pretrained_model_name_or_path: tokenizer 17 | padding: false 18 | truncation: false 19 | -------------------------------------------------------------------------------- /tutorials/getting_started/data/mem_map/.git_keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tutorials/getting_started/data/mem_map/.git_keep -------------------------------------------------------------------------------- /tutorials/getting_started/scripts/run_checkpoint_conversion.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | # --------------------------------------------- 5 | # bash run_checkpoint_conversion 6 | # --------------------------------------------- 7 | 8 | ####################### 9 | ### INPUT ARGUMENTS ### 10 | ####################### 11 | if [ -z "$1" ] || [ -z "$2" ] # if one of the two input arguments does not exist 12 | then 13 | echo "Need to specify arguments, e.g. 
bash run_checkpoint_conversion.sh modalities_config output_dir" 14 | exit 15 | fi 16 | 17 | ############# 18 | ### RUN ##### 19 | ############# 20 | echo "> run checkpoint conversion" 21 | echo "python ../../src/modalities/conversion/gpt2/convert_gpt2.py" $1 $2 "--num_testruns 5" 22 | python ../../src/modalities/conversion/gpt2/convert_gpt2.py $1 $2 --num_testruns 5 23 | -------------------------------------------------------------------------------- /tutorials/getting_started/scripts/run_getting_started_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # --------------------------------------------- 5 | # bash run_getting_started_example.sh 0 1 6 | # (can only be run on 2 GPUs using this script) 7 | # --------------------------------------------- 8 | 9 | ####################### 10 | ### INPUT ARGUMENTS ### 11 | ####################### 12 | if [ -z "$1" ] || [ -z "$2" ] # if one of the two input arguments does not exist 13 | then 14 | echo "Need to specify 2 GPU devices as arguments, e.g. bash run_getting_started_example.sh 0 1" 15 | exit 16 | fi 17 | if [[ $1 =~ [^0-7] ]] || [[ $2 =~ [^0-7] ]] # if one of the two input arguments contains a character other than 0-7 18 | then 19 | echo "Need to specify integers 0-7 as arguments, e.g. bash run_getting_started_example.sh 0 1" 20 | exit 21 | fi 22 | 23 | CUDA_VISIBLE_DEVICES="$1,$2" 24 | 25 | ############# 26 | ### RUN ##### 27 | ############# 28 | echo "> run getting_started_example on CUDA_VISIBLE_DEVICES="$CUDA_VISIBLE_DEVICES 29 | 30 | modalities data create_raw_index --index_path data/mem_map/redpajama_v2_samples_512_train.idx data/raw/redpajama_v2_samples_512_train.jsonl 31 | modalities data create_raw_index --index_path data/mem_map/redpajama_v2_samples_512_test.idx data/raw/redpajama_v2_samples_512_test.jsonl 32 | modalities data pack_encoded_data configs/example_dataset_config_train.yaml 33 | modalities data pack_encoded_data configs/example_dataset_config_test.yaml 34 | CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES torchrun --rdzv-endpoint localhost:29505 --nnodes 1 --nproc_per_node 2 $(which modalities) run --config_file_path configs/example_config.yaml 35 | -------------------------------------------------------------------------------- /tutorials/getting_started/tokenizer/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "add_prefix_space": false, 3 | "added_tokens_decoder": { 4 | "50256": { 5 | "content": "<|endoftext|>", 6 | "lstrip": false, 7 | "normalized": true, 8 | "rstrip": false, 9 | "single_word": false, 10 | "special": true 11 | } 12 | }, 13 | "bos_token": "<|endoftext|>", 14 | "clean_up_tokenization_spaces": true, 15 | "eos_token": "<|endoftext|>", 16 | "model_max_length": 1024, 17 | "tokenizer_class": "GPT2Tokenizer", 18 | "unk_token": "<|endoftext|>" 19 | } 20 | -------------------------------------------------------------------------------- /tutorials/library_usage/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | import torch 5 | from pydantic import BaseModel 6 | 7 | from modalities.__main__ import Main 8 | from modalities.batch import DatasetBatch 9 | from modalities.config.config import ProcessGroupBackendType 10 | from modalities.config.instantiation_models import TrainingComponentsInstantiationModel 11 | from modalities.models.gpt2.collator import CollateFnIF 12 | from modalities.running_env.cuda_env
import CudaEnv 13 | 14 | 15 | class CustomGPT2LLMCollateFnConfig(BaseModel): 16 | sample_key: str 17 | target_key: str 18 | custom_attribute: str 19 | 20 | 21 | class CustomGPT2LLMCollateFn(CollateFnIF): 22 | def __init__(self, sample_key: str, target_key: str, custom_attribute: str): 23 | self.sample_key = sample_key 24 | self.target_key = target_key 25 | self.custom_attribute = custom_attribute 26 | self._num_calls = 0 27 | 28 | @property 29 | def num_calls(self) -> int: 30 | return self._num_calls 31 | 32 | def __call__(self, batch: list[list[int]]) -> DatasetBatch: 33 | sample_tensor = torch.tensor(batch) 34 | samples = {self.sample_key: sample_tensor[:, :-1]} 35 | targets = {self.target_key: sample_tensor[:, 1:]} 36 | self._num_calls += 1 37 | return DatasetBatch(targets=targets, samples=samples) 38 | 39 | 40 | def main(): 41 | # load and parse the config file 42 | cwd = Path(__file__).parent 43 | # change to cwd 44 | os.chdir(cwd) 45 | config_file_path = cwd / Path("config_lorem_ipsum.yaml") 46 | 47 | with CudaEnv(process_group_backend=ProcessGroupBackendType.nccl): 48 | # instantiate the Main entrypoint of modalities by passing in the config path 49 | modalities_main = Main(config_path=config_file_path) 50 | 51 | # add the custom component to modalities 52 | modalities_main.add_custom_component( 53 | component_key="collate_fn", 54 | variant_key="custom_gpt_2_llm_collator", 55 | custom_component=CustomGPT2LLMCollateFn, 56 | custom_config=CustomGPT2LLMCollateFnConfig, 57 | ) 58 | # run the experiment 59 | components: TrainingComponentsInstantiationModel = modalities_main.build_components( 60 | components_model_type=TrainingComponentsInstantiationModel 61 | ) 62 | modalities_main.run(components) 63 | 64 | collate_fn = components.train_dataloader.collate_fn 65 | if collate_fn.num_calls < 1: 66 | raise ValueError("Custom collator was not called during training.") 67 | print(f"Custom collator was called {collate_fn.num_calls} times during training.") 68 | 69 | 70 | if __name__ == "__main__": 71 | main() 72 | -------------------------------------------------------------------------------- /tutorials/library_usage/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | CUDA_VISIBLE_DEVICES=0,1 torchrun --rdzv-endpoint localhost:29504 --nnodes 1 --nproc_per_node 2 main.py -------------------------------------------------------------------------------- /tutorials/library_usage/tokenizer/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "add_prefix_space": false, 3 | "added_tokens_decoder": { 4 | "50256": { 5 | "content": "<|endoftext|>", 6 | "lstrip": false, 7 | "normalized": true, 8 | "rstrip": false, 9 | "single_word": false, 10 | "special": true 11 | } 12 | }, 13 | "bos_token": "<|endoftext|>", 14 | "clean_up_tokenization_spaces": true, 15 | "eos_token": "<|endoftext|>", 16 | "model_max_length": 1024, 17 | "tokenizer_class": "GPT2Tokenizer", 18 | "unk_token": "<|endoftext|>" 19 | } 20 | -------------------------------------------------------------------------------- /tutorials/modalities_in_15_mins/README.md: -------------------------------------------------------------------------------- 1 | # Getting started with Modalities in 15 minutes 2 | 3 | Throughout the tutorial, we will use the Jupyter Notebook `modalities_demo.ipynb` to guide us through the process of getting started with Modalities. 
The notebook is located in the root directory of the tutorial, along with the `configs` and `data` directories. The `configs` directory contains configuration files for the model pretraining and tokenization, while the `data` directory contains subdirectories for storing checkpoints, preprocessed data, raw data, and tokenizer-related files. 4 | 5 | ```text 6 | └── getting_started_15mins # Root directory for the tutorial 7 | ├── modalities_demo.ipynb # Jupyter Notebook which we will be using for the tutorial. 8 | ├── configs 9 | │ ├── pretraining_config.yaml # Config file for the model pretraining 10 | │ └── tokenization_config.yaml # Config file for tokenization 11 | └── data 12 | ├── checkpoints # Dir where model and optimizer checkpoints are stored. 13 | │ └── 14 | ├── preprocessed # Dir containing preprocessed training and evaluation data. 15 | │ └── 16 | ├── raw 17 | │ └── fineweb_edu_num_docs_483606.jsonl # JSONL file containing raw data for training and evaluation. 18 | └── tokenizer 19 | ├── tokenizer.json # JSON file defining the tokenizer model, including token mappings. 20 | └── tokenizer_config.json # Config file specifying all tokenizer settings 21 | ``` 22 | 23 | 24 | To start the tutorial check out the Jupyter Notebook `modalities_demo.ipynb` and follow the instructions provided in the notebook. 25 | If you do not have Jupyter Notebook installed in your python environment yet, you can install it by running the following command: 26 | 27 | ```bash 28 | pip install jupyterlab 29 | ``` -------------------------------------------------------------------------------- /tutorials/modalities_in_15_mins/configs/tokenization_config.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | src_path: data/raw/fineweb_edu_num_docs_483606.jsonl 3 | dst_path: data/preprocessed/fineweb_edu_num_docs_483606.pbin 4 | index_path: data/preprocessed/fineweb_edu_num_docs_483606.idx 5 | jq_pattern: .text 6 | num_cpus: ${node_env:num_cpus} 7 | eod_token: <|endoftext|> 8 | processing_batch_size: 10 9 | raw_samples_queue_size: 300 10 | processed_samples_queue_size: 300 11 | 12 | tokenizer: 13 | component_key: tokenizer 14 | variant_key: pretrained_hf_tokenizer 15 | config: 16 | pretrained_model_name_or_path: data/tokenizer 17 | padding: false 18 | truncation: false -------------------------------------------------------------------------------- /tutorials/modalities_in_15_mins/data/checkpoints/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tutorials/modalities_in_15_mins/data/checkpoints/.gitkeep -------------------------------------------------------------------------------- /tutorials/modalities_in_15_mins/data/preprocessed/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tutorials/modalities_in_15_mins/data/preprocessed/.gitkeep -------------------------------------------------------------------------------- /tutorials/modalities_in_15_mins/data/raw/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tutorials/modalities_in_15_mins/data/raw/.gitkeep -------------------------------------------------------------------------------- 
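Editorial aside: the `pretrained_hf_tokenizer` component in the tokenization config above points at the `data/tokenizer` directory, which follows the standard Hugging Face fast-tokenizer layout (`tokenizer.json` plus the `tokenizer_config.json` shown next). A minimal sketch for sanity-checking such a tokenizer directory outside of Modalities — assuming the `transformers` package is installed; this is an illustration, not how the component itself loads the files:

```python
from transformers import AutoTokenizer

# Load tokenizer.json / tokenizer_config.json from the tutorial's tokenizer directory.
tokenizer = AutoTokenizer.from_pretrained("data/tokenizer")

# The GPT-2 style setup uses <|endoftext|> (id 50256) as bos/eos/unk token.
ids = tokenizer("Hello world<|endoftext|>")["input_ids"]
print(ids)
print(tokenizer.decode(ids))  # should round-trip to the original text
```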
/tutorials/modalities_in_15_mins/data/tokenizer/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "add_prefix_space": false, 3 | "added_tokens_decoder": { 4 | "50256": { 5 | "content": "<|endoftext|>", 6 | "lstrip": false, 7 | "normalized": true, 8 | "rstrip": false, 9 | "single_word": false, 10 | "special": true 11 | } 12 | }, 13 | "bos_token": "<|endoftext|>", 14 | "clean_up_tokenization_spaces": true, 15 | "eos_token": "<|endoftext|>", 16 | "model_max_length": 1024, 17 | "tokenizer_class": "GPT2Tokenizer", 18 | "unk_token": "<|endoftext|>" 19 | } 20 | -------------------------------------------------------------------------------- /tutorials/modalities_in_15_mins/res/banner.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tutorials/modalities_in_15_mins/res/banner.jpg -------------------------------------------------------------------------------- /tutorials/modalities_in_15_mins/res/notebooks_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tutorials/modalities_in_15_mins/res/notebooks_1.png -------------------------------------------------------------------------------- /tutorials/warmstart/README.md: -------------------------------------------------------------------------------- 1 | # Warmstart Tutorial 2 | 3 | In this tutorial, we demonstrate how you can continue the training from a checkpoint, e.g., after the training was interrupted or crashed. 4 | 5 | ## Prerequisites 6 | We will use the data from the [Modalities in 15 mins Tutorial](../modalities_in_15_mins/modalities_demo.ipynb). 7 | If you haven't already, please run the data generation part of the notebook to generate the data. 8 | 9 | 10 | ## Running and warmstarting the model training 11 | 12 | To train the model, we will execute the configuration file `pre_training_config.yaml` stored in the folder `configs`, as follows: 13 | 14 | ```bash 15 | CUDA_VISIBLE_DEVICES="5,6" torchrun \ 16 | --rdzv-endpoint localhost:29516 \ 17 | --nnodes 1 \ 18 | --nproc_per_node 2 \ 19 | $(which modalities) run \ 20 | --config_file_path configs/pre_training_config.yaml 21 | ``` 22 | 23 | 24 | We will interrupt the training manually (e.g., CTRL + C) after the step-250 checkpoint has been written out to `data/checkpoints/`. 25 | 26 | To continue the training from the checkpoint, we will execute the configuration file `warmstart.yaml` stored in the folder `configs`, running the command below. 27 | Note that we have to change the paths under `warmstart_checkpoint_paths` in `warmstart.yaml` so that they point to the correct model and optimizer checkpoint files. 28 | 29 | ```bash 30 | CUDA_VISIBLE_DEVICES="5,6" torchrun \ 31 | --rdzv-endpoint localhost:29516 \ 32 | --nnodes 1 \ 33 | --nproc_per_node 2 \ 34 | $(which modalities) run \ 35 | --config_file_path configs/warmstart.yaml 36 | ``` 37 | 38 | 39 | Note that warmstarts do not require you to run the training on the exact same hardware. You can adapt the number of GPUs, number of tokens per batch, etc. in the command line arguments and in the configuration file. 40 | However, the training result is most likely not exactly the same as if you had continued the training on the same hardware.
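To make the hardware-flexibility point concrete, here is a back-of-the-envelope sketch (an editorial illustration with made-up numbers, not Modalities' internal accounting): the quantity that should stay comparable across a warmstart is the number of tokens consumed per optimizer step, which is also what the `enforce_tokens_per_step_consistency` check described next guards.

```python
def tokens_per_step(num_ranks: int, micro_batch_size: int,
                    grad_accumulation_steps: int, sequence_length: int) -> int:
    # Tokens consumed by one optimizer step across all data-parallel ranks.
    return num_ranks * micro_batch_size * grad_accumulation_steps * sequence_length

# Moving from 2 GPUs to 4 GPUs while halving the per-GPU micro-batch size
# keeps the tokens per step identical.
assert tokens_per_step(2, 8, 1, 2048) == tokens_per_step(4, 4, 1, 2048) == 32768
```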

We specify consistency checks in the configuration file, such as

```yaml
consistency_enforcement:
  enforce_tokens_per_step_consistency: true
  enforce_last_step_logged: false
  enforce_last_step_evaluated: false
  enforce_last_step_checkpointed: false
```

which can be relaxed to only print warnings instead of raising exceptions.

--------------------------------------------------------------------------------
/tutorials/warmstart/configs/tokenization_config_train.yaml:
--------------------------------------------------------------------------------
settings:
  src_path: ../../getting_started/data/raw/redpajama_v2_samples_512_train.jsonl
  dst_path: ../data/mem_map/redpajama_v2_samples_512_train.pbin
  index_path: ../data/mem_map/redpajama_v2_samples_512_train.idx
  jq_pattern: .raw_content
  num_cpus: ${node_env:num_cpus}
  eod_token: <|endoftext|>
  processing_batch_size: 1000
  raw_samples_queue_size: 300
  processed_samples_queue_size: 300

tokenizer:
  component_key: tokenizer
  variant_key: pretrained_hf_tokenizer
  config:
    pretrained_model_name_or_path: ../../getting_started/tokenizer
    padding: false
    truncation: false
--------------------------------------------------------------------------------
/tutorials/warmstart/scripts/check_checkpoint_consistency.py:
--------------------------------------------------------------------------------
import glob
import os
import re
from pathlib import Path


def _get_checkpoint_file_name_without_eid(checkpoint_file_name: str) -> str:
    # Remove the experiment id prefix from the checkpoint file or folder name
    return re.sub(r"^eid_\d{4}-\d{2}-\d{2}__\d{2}-\d{2}-\d{2}_[a-f0-9]+-", "", checkpoint_file_name)


def test_checkpoint_files_exist(checkpoint_folder_path: Path, expected_checkpoint_names: list[str]):
    # Check that all the checkpoint files exist and have the correct names
    checkpoint_paths = glob.glob(str(checkpoint_folder_path / "**/*"), recursive=True)

    assert len(checkpoint_paths) == 17, "ERROR! Expected 17 files and folders in the checkpoint directory."

    assert len([p for p in checkpoint_paths if p.endswith(".distcp")]) == 6, "ERROR! Expected 6 .distcp checkpoint files."
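    # Illustrative addition, not part of the original script: verify that each
    # expected checkpoint folder name occurs among the discovered paths once the
    # run-specific experiment-id prefix has been stripped off.
    stripped_names = {_get_checkpoint_file_name_without_eid(Path(p).name) for p in checkpoint_paths}
    for expected_name in expected_checkpoint_names:
        assert any(expected_name in name for name in stripped_names), f"ERROR! Missing checkpoint '{expected_name}'."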


if __name__ == "__main__":
    current_file_path = Path(__file__).resolve()
    os.chdir(current_file_path.parent)

    checkpoint_folder_path = Path("../data/checkpoints")

    expected_checkpoint_folder_names = [
        # pretrain checkpoint
        "seen_steps_11-seen_tokens_45056-target_steps_20-target_tokens_81920",
        # warmstart checkpoints
        "seen_steps_15-seen_tokens_61440-target_steps_20-target_tokens_81920",
        "seen_steps_20-seen_tokens_81920-target_steps_20-target_tokens_81920",
    ]

    test_checkpoint_files_exist(checkpoint_folder_path, expected_checkpoint_folder_names)
--------------------------------------------------------------------------------
/tutorials/warmstart/scripts/pre_train_and_warmstart.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set -ex


# ---------------------------------------------
# bash pre_train_and_warmstart.sh 0 1
# (can only be run on 2 GPUs using this script)
# ---------------------------------------------

#######################
### INPUT ARGUMENTS ###
#######################
if [ -z "$1" ] || [ -z "$2" ]  # if one of the two input arguments is missing
then
    echo "Need to specify 2 GPU devices as arguments, e.g. bash pre_train_and_warmstart.sh 0 1"
    exit 1
fi
if [[ $1 =~ [^0-7] ]] || [[ $2 =~ [^0-7] ]]  # if one of the two input arguments is not an integer between 0 and 7
then
    echo "Need to specify integers 0-7 as arguments, e.g. bash pre_train_and_warmstart.sh 0 1"
    exit 1
fi

CUDA_VISIBLE_DEVICES="$1,$2"


echo "> run warmstart example on CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"

# cd to the directory of the script (absolute path)
cd "$(dirname "$0")"

rm -rf ../data/


# run preprocessing: build the raw-data index, then tokenize and pack the data
modalities data create_raw_index --index_path ../data/mem_map/redpajama_v2_samples_512_train.idx ../../getting_started/data/raw/redpajama_v2_samples_512_train.jsonl
modalities data pack_encoded_data ../configs/tokenization_config_train.yaml

# run pretraining
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES torchrun --rdzv-endpoint localhost:29504 --nnodes 1 --nproc_per_node 2 $(which modalities) run --config_file_path ../configs/pre_training_config.yaml

# run warmstart from the last written checkpoint
checkpoint_path=$(find ../data/checkpoints -name "last_checkpoint_info.json" -exec realpath {} \;)
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES torchrun --rdzv-endpoint localhost:29504 --nnodes 1 --nproc_per_node 2 $(which modalities) warmstart --config_file_path ../configs/warmstart_config.yaml --last_checkpoint_info_file_path $checkpoint_path

# run consistency checks on the written checkpoints
python check_checkpoint_consistency.py

rm -rf ../data/

echo "Finished warmstart example"
--------------------------------------------------------------------------------