├── .coveragerc ├── .github ├── ISSUE_TEMPLATE │ ├── bug-report.yaml │ ├── config.yaml │ ├── documentation.yaml │ └── feature-request.yaml ├── pull_request_template.md └── workflows │ ├── build_and_deploy_documentation.yml │ ├── check_arc_runner_env.yml │ ├── linting.yml │ ├── release_automation.yml │ └── tests_full.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── CHANGELOG_DEV.md ├── CITATION.cff ├── CONTRIBUTING.md ├── Dataset.md ├── LICENSE ├── MMAP_DATASET_README.md ├── README.md ├── config_files ├── data_preparation │ ├── packed_cc_en_2048.yaml │ └── packed_dataset_config.yaml ├── text_generation │ └── text_generation_config_torch.yaml └── training │ ├── config_example_coca.yaml │ ├── config_lorem_ipsum_long_fsdp1.yaml │ ├── config_lorem_ipsum_long_fsdp1_warmstart.yaml │ ├── config_lorem_ipsum_long_fsdp2.yaml │ └── config_lorem_ipsum_long_fsdp2_warmstart.yaml ├── data ├── checkpoints │ └── .gitkeep ├── lorem_ipsum.idx ├── lorem_ipsum.jsonl ├── lorem_ipsum.pbin ├── lorem_ipsum_long.idx ├── lorem_ipsum_long.jsonl ├── lorem_ipsum_long.pbin ├── tokenizer │ ├── hf_gpt2 │ │ ├── merges.txt │ │ ├── special_tokens_map.json │ │ ├── tokenizer.json │ │ ├── tokenizer_config.json │ │ └── vocab.json │ ├── sentencepiece_dclm │ │ └── en_32k_tokenizer.model │ └── tokenizer_gpt2.json └── wiki_data_downloader.sh ├── docs ├── Makefile ├── components │ └── components.md ├── dev_ops │ ├── release_procedure.md │ └── tests.md ├── fsdp1_vs_fsdp_2.md ├── make.bat ├── requirements.txt ├── scaling_experiments │ ├── scaling_28B_mbs_1_ac_True.png │ ├── scaling_leonardo.md │ └── scaling_mn5.md ├── source │ ├── banner.jpg │ ├── benchmarking.rst │ ├── conf.py │ ├── configuration.rst │ ├── entrypoints.rst │ ├── future_work.rst │ ├── index.rst │ ├── known_issues.rst │ ├── logo.jpg │ ├── memmap.rst │ ├── model_cards.rst │ ├── quickstart.rst │ └── vs_code_setup.rst └── supported_features.md ├── notebooks ├── components.yaml ├── redpajama_tokenizer_test.ipynb └── tokenizer │ ├── redpajama_v2_samples_512_test.idx │ ├── redpajama_v2_samples_512_test.pbin │ └── unigram_tokenizer.model ├── pyproject.toml ├── scripts ├── convco_for_reverts.sh └── train.sh ├── src └── modalities │ ├── __init__.py │ ├── __main__.py │ ├── api.py │ ├── batch.py │ ├── checkpointing │ ├── __init__.py │ ├── checkpoint_conversion.py │ ├── checkpoint_loading.py │ ├── checkpoint_saving.py │ ├── checkpoint_saving_execution.py │ ├── checkpoint_saving_instruction.py │ ├── checkpoint_saving_strategies.py │ ├── fsdp │ │ ├── __init__.py │ │ ├── fsdp_checkpoint_loading.py │ │ └── fsdp_checkpoint_saving.py │ ├── stateful │ │ ├── __init__.py │ │ ├── app_state.py │ │ └── app_state_factory.py │ └── torch │ │ ├── __init__.py │ │ └── torch_checkpoint_loading.py │ ├── config │ ├── __init__.py │ ├── component_factory.py │ ├── config.py │ ├── instantiation_models.py │ ├── lookup_enum.py │ ├── pydantic_if_types.py │ └── utils.py │ ├── conversion │ ├── __init__.py │ └── gpt2 │ │ ├── __init__.py │ │ ├── configuration_gpt2.py │ │ ├── conversion_code.py │ │ ├── conversion_model.py │ │ ├── conversion_tokenizer.py │ │ ├── convert_gpt2.py │ │ └── modeling_gpt2.py │ ├── dataloader │ ├── __init__.py │ ├── create_index.py │ ├── create_packed_data.py │ ├── dataloader.py │ ├── dataloader_factory.py │ ├── dataset.py │ ├── dataset_factory.py │ ├── large_file_lines_reader.py │ ├── preprocessing │ │ ├── __init__.py │ │ └── tokenization │ │ │ └── tokenized_file_writer.py │ └── samplers.py │ ├── evaluator.py │ ├── exceptions.py │ ├── gym.py │ ├── inference │ ├── 
__init__.py │ ├── inference.py │ └── text │ │ ├── __init__.py │ │ ├── config.py │ │ └── inference_component.py │ ├── logging_broker │ ├── __init__.py │ ├── message_broker.py │ ├── messages.py │ ├── publisher.py │ ├── subscriber.py │ └── subscriber_impl │ │ ├── __init__.py │ │ ├── progress_subscriber.py │ │ ├── results_subscriber.py │ │ └── subscriber_factory.py │ ├── loss_functions.py │ ├── models │ ├── __init__.py │ ├── coca │ │ ├── __init__.py │ │ ├── attention_pooling.py │ │ ├── coca_model.py │ │ ├── collator.py │ │ ├── multi_modal_decoder.py │ │ └── text_decoder.py │ ├── components │ │ ├── __init__.py │ │ └── layer_norms.py │ ├── gpt2 │ │ ├── __init__.py │ │ ├── collator.py │ │ ├── gpt2_model.py │ │ └── pretrained_gpt_model.py │ ├── huggingface │ │ ├── __init__.py │ │ └── huggingface_model.py │ ├── huggingface_adapters │ │ ├── __init__.py │ │ └── hf_adapter.py │ ├── model.py │ ├── model_factory.py │ ├── utils.py │ └── vision_transformer │ │ ├── __init__.py │ │ └── vision_transformer_model.py │ ├── nn │ ├── __init__.py │ ├── attention.py │ ├── mlp.py │ └── model_initialization │ │ ├── __init__.py │ │ ├── composed_initialization.py │ │ ├── initialization_if.py │ │ ├── initialization_routines.py │ │ └── parameter_name_filters.py │ ├── optimizers │ ├── __init__.py │ ├── lr_schedulers.py │ └── optimizer_factory.py │ ├── preprocessing │ ├── __init__.py │ ├── create_chunks.py │ └── shuffle_data.py │ ├── registry │ ├── __init__.py │ ├── components.py │ └── registry.py │ ├── running_env │ ├── __init__.py │ ├── cuda_env.py │ ├── env_utils.py │ └── fsdp │ │ ├── __init__.py │ │ ├── device_mesh.py │ │ ├── fsdp_auto_wrapper.py │ │ └── reducer.py │ ├── tokenization │ ├── __init__.py │ └── tokenizer_wrapper.py │ ├── trainer.py │ ├── training │ ├── __init__.py │ ├── activation_checkpointing.py │ ├── gradient_clipping │ │ ├── __init__.py │ │ ├── fsdp_gradient_clipper.py │ │ ├── fsdp_gradient_clipper_config.py │ │ └── gradient_clipper.py │ └── training_progress.py │ ├── util.py │ └── utils │ ├── __init__.py │ ├── logging.py │ ├── mfu.py │ ├── number_conversion.py │ ├── seeding.py │ ├── typing.py │ └── verify_tokenization_consistency.py ├── tests ├── __init__.py ├── checkpointing │ ├── __init__.py │ ├── checkpointing_test_utils.py │ ├── configs_for_testing │ │ └── gpt2_config_test.yaml │ ├── fsdp2_gpt2_config.yaml │ ├── gpt2_config.yaml │ ├── pytorch │ │ ├── __init__.py │ │ └── test_torch_checkpoint_loading.py │ ├── test_checkpoint_conversion.py │ ├── test_checkpoint_execution_functions.py │ ├── test_checkpoint_strategies.py │ ├── test_fsdp1_to_disc_checkpointing.py │ └── test_fsdp2_dcp_checkpoint_loading_and_saving.py ├── config │ ├── __init__.py │ ├── components.py │ ├── configs.py │ ├── custom_components.py │ ├── test_component_factory.py │ └── test_configs │ │ ├── config_backward_reference.yaml │ │ ├── config_forward_reference.yaml │ │ ├── config_hierarchical_list_component.yaml │ │ ├── config_multiple_top_level_components_with_references.yaml │ │ ├── config_non_existing_reference.yaml │ │ └── config_single_component.yaml ├── conftest.py ├── conversion │ ├── __init__.py │ ├── gpt2 │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── helper.py │ │ ├── test_conversion_code.py │ │ ├── test_conversion_model.py │ │ ├── test_conversion_tokenizer.py │ │ └── test_convert_gpt2.py │ └── test_configs │ │ └── gpt2_config_test.yaml ├── data │ └── datasets │ │ ├── danish_test_dataset.jsonl │ │ ├── lorem_ipsum_long.idx │ │ ├── lorem_ipsum_long.jsonl │ │ ├── lorem_ipsum_long.pbin │ │ └── 
lorem_ipsum_without_last_newline.jsonl ├── dataloader │ ├── __init__.py │ ├── distributed │ │ ├── dist_dataloader_config_with_shuffling.yaml │ │ ├── dist_dataloader_config_with_shuffling_and_skipped_batches.yaml │ │ ├── dist_dataloader_config_without_shuffling.yaml │ │ └── test_distributed_dataloader.py │ ├── dummy_sequential_dataset.py │ ├── preprocessing │ │ ├── __init__.py │ │ ├── chunking │ │ │ ├── __init__.py │ │ │ └── test_create_chunks.py │ │ └── tokenization │ │ │ ├── __init__.py │ │ │ └── test_tokenized_file_writer.py │ ├── samplers │ │ ├── __init__.py │ │ ├── test_distributed_samplers.py │ │ └── test_sequential_samplers.py │ ├── test_combined_dataset.py │ ├── test_dataloader.py │ ├── test_dummy_dataset.py │ ├── test_end_to_end_indexation_and_tokenization.py │ ├── test_large_file_lines_reader.py │ ├── test_packed_dataset.py │ ├── test_shuffle_tokenized_data.py │ └── yaml_configs │ │ └── skipped_dataloader.yaml ├── end2end_tests │ ├── __init__.py │ ├── custom_components.py │ ├── gpt2_train_num_steps_8.yaml │ ├── gpt2_warm_start_from_step_4.yaml │ ├── lorem_ipsum.pbin │ ├── system_tests │ │ ├── __init__.py │ │ ├── configs │ │ │ ├── fsdp1_gpt2_train_num_steps_8.yaml │ │ │ └── fsdp2_gpt2_train_num_steps_8.yaml │ │ └── test_fsdp_loss_convergence.py │ ├── test_create_filtered_tokenized_dataset.py │ ├── test_create_shuffled_dataset_chunk.py │ ├── test_create_shuffled_jsonl_dataset_chunk.py │ ├── test_fsdp_warmstart.py │ ├── test_shuffle_jsonl_data.py │ ├── test_shuffle_tokenized_data.py │ └── test_utils.py ├── fsdp2_parallelization │ ├── __init__.py │ └── test_full_and_hybrid_sharding.py ├── models │ ├── __init__.py │ ├── coca │ │ ├── __init__.py │ │ ├── coca_config.yaml │ │ ├── test_attention_pooling.py │ │ └── test_coca.py │ ├── components │ │ ├── __init__.py │ │ └── test_layer_norms.py │ ├── test_causal_self_attention.py │ ├── test_hf_adapter.py │ ├── test_model_factory.py │ └── vision_transformer │ │ ├── test_vision_transformer.py │ │ └── vision_transformer_config.yaml ├── nn │ ├── test_attention.py │ └── test_mlp.py ├── run_all_tests.sh ├── run_distributed_tests.sh ├── test_evaluator.py ├── test_gradient_clipping.py ├── test_gym.py ├── test_initialization_fsdp1.py ├── test_initialization_fsdpx.py ├── test_loss_functions.py ├── test_lr_scheduler.py ├── test_main.py ├── test_optimizer_factory.py ├── test_rotary_qkv_transform.py ├── test_tokenization.py ├── test_torch_compile.py ├── test_utils.py ├── test_weight_tying.py ├── test_yaml_configs │ ├── coca_config_initialization.yaml │ ├── config_lorem_ipsum_fsdp1.yaml │ ├── config_lorem_ipsum_fsdp2.yaml │ ├── gpt2_config_initialization.yaml │ ├── gpt2_config_initialization_fsdp1.yaml │ ├── gpt2_config_initialization_fsdp2.yaml │ ├── gpt2_config_mfu_fsdp1.yaml │ ├── gpt2_config_mfu_fsdp2.yaml │ └── gpt2_config_optimizer.yaml ├── tests.py ├── tmp │ └── .gitkeep └── utils │ ├── __init__.py │ ├── test_experiment_id_generation.py │ ├── test_mfu.py │ ├── test_number_conversion.py │ └── test_seeding.py └── tutorials ├── getting_started ├── README.md ├── checkpoints │ └── .gitkeep ├── configs │ ├── example_config.yaml │ ├── example_conversion_config_template.yaml │ ├── example_dataset_config_test.yaml │ ├── example_dataset_config_train.yaml │ └── example_text_generation_config.yaml ├── data │ ├── mem_map │ │ └── .git_keep │ └── raw │ │ ├── redpajama_v2_samples_512_test.jsonl │ │ └── redpajama_v2_samples_512_train.jsonl ├── scripts │ ├── run_checkpoint_conversion.sh │ └── run_getting_started_example.sh └── tokenizer │ ├── tokenizer.json │ └── 
tokenizer_config.json ├── library_usage ├── README.md ├── config_lorem_ipsum.yaml ├── main.py ├── run.sh └── tokenizer │ ├── tokenizer.json │ └── tokenizer_config.json ├── modalities_in_15_mins ├── README.md ├── configs │ ├── pretraining_config.yaml │ └── tokenization_config.yaml ├── data │ ├── checkpoints │ │ └── .gitkeep │ ├── preprocessed │ │ └── .gitkeep │ ├── raw │ │ └── .gitkeep │ └── tokenizer │ │ ├── tokenizer.json │ │ └── tokenizer_config.json ├── modalities_demo.ipynb └── res │ ├── banner.jpg │ ├── fsdp_bright.svg │ ├── modalities_file_format_bright.svg │ ├── modalities_indexation_bright.svg │ ├── modalities_tokenization_bright.svg │ └── notebooks_1.png └── warmstart ├── README.md ├── configs ├── pre_training_config.yaml ├── tokenization_config_train.yaml └── warmstart_config.yaml └── scripts ├── check_checkpoint_consistency.py └── pre_train_and_warmstart.sh /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | parallel = True 3 | branch = True 4 | source = src/modalities 5 | 6 | 7 | [report] 8 | exclude_lines = 9 | # Exclude lines that match the following patterns 10 | pragma: no cover 11 | if __name__ == .__main__.: 12 | 13 | omit = 14 | /tmp/* 15 | /usr/* 16 | */tests/* 17 | # Add more patterns if necessary 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.yaml: -------------------------------------------------------------------------------- 1 | name: 🐛 Bug Report 2 | description: Submit a bug report to help improve modalities 3 | labels: [ "bug" ] 4 | 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: > 9 | #### Before submitting a bug report, please make sure the issue hasn't already been addressed, by searching through [the existing and past issues](https://github.com/Modalities/modalities/issues). 10 | 11 | - type: textarea 12 | id: system-info 13 | attributes: 14 | label: System Info 15 | description: Please share your system info with us. 16 | placeholder: modalities version, platform, python version, ... 17 | validations: 18 | required: true 19 | 20 | - type: textarea 21 | attributes: 22 | label: 🐛 Describe the bug 23 | description: | 24 | Please provide a clear and concise description of what the bug is. If relevant, add a minimal example so that we can reproduce the error by running the code. Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. 25 | placeholder: | 26 | A clear and concise description of what the bug is. 27 | 28 | ```python 29 | # Sample code to reproduce the problem 30 | ``` 31 | 32 | ``` 33 | The error message you got, with the full traceback. 34 | ``` 35 | validations: 36 | required: true 37 | 38 | - type: markdown 39 | attributes: 40 | value: > 41 | Thanks for contributing 🎉! 
-------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yaml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation.yaml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation 2 | description: Report an issue related to https://modalities.github.io/modalities/ 3 | labels: [ "documentation" ] 4 | 5 | body: 6 | - type: textarea 7 | attributes: 8 | label: 📚 The doc issue 9 | description: > 10 | A clear and concise description of what content in https://modalities.github.io/modalities/ is an issue. 11 | validations: 12 | required: true 13 | 14 | - type: textarea 15 | attributes: 16 | label: Suggest a potential alternative/fix 17 | description: > 18 | Tell us how we could improve the documentation in this regard. 19 | 20 | - type: markdown 21 | attributes: 22 | value: > 23 | Thanks for contributing 🎉! -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.yaml: -------------------------------------------------------------------------------- 1 | name: 🚀 Feature Request 2 | description: Submit a proposal/request for a new modalities feature 3 | labels: [ "feature" ] 4 | 5 | body: 6 | - type: textarea 7 | id: feature-request 8 | validations: 9 | required: true 10 | attributes: 11 | label: Feature request 12 | description: | 13 | A clear and concise description of the feature proposal. 14 | 15 | - type: textarea 16 | id: motivation 17 | validations: 18 | required: true 19 | attributes: 20 | label: Motivation 21 | description: | 22 | Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link it here, too. 23 | 24 | - type: markdown 25 | attributes: 26 | value: > 27 | Thanks for contributing 🎉! -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # What does this PR do? 2 | 3 | This PR .. 4 | 5 | ## General Changes 6 | * .. 7 | 8 | ## Breaking Changes 9 | * .. 10 | 11 | ## Checklist before submitting final PR 12 | - [ ] My PR is minimal and addresses one issue in isolation 13 | - [ ] I have merged the latest version of the target branch into this feature branch 14 | - [ ] I have reviewed my own code w.r.t. correct implementation, missing type hints, proper documentation, etc. 
15 | - [ ] I have run a sample config for model training 16 | - [ ] I have checked that all tests run through (`python tests/tests.py`) 17 | - [ ] I have updated the internal changelog (`CHANGELOG_DEV.md`) -------------------------------------------------------------------------------- /.github/workflows/build_and_deploy_documentation.yml: -------------------------------------------------------------------------------- 1 | name: "Build Sphinx Documentation" 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - main 8 | 9 | permissions: 10 | contents: write 11 | 12 | jobs: 13 | docs: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.11"] 18 | steps: 19 | - uses: actions/checkout@v4 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v3 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - name: Install dependencies 25 | run: | 26 | sudo apt-get update 27 | sudo apt-get install git -y 28 | python -m pip install torch==2.6.0 29 | python -m pip install --upgrade pip setuptools wheel 30 | export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE 31 | python -m pip install -e . 32 | python -m pip install myst-parser 33 | python -m pip install sphinx-rtd-theme sphinx-autodoc-typehints sphinx-click sphinx-automodapi texext 34 | - name: "Parse into HTML" 35 | run: | 36 | sphinx-apidoc -o docs/source/api src/modalities 37 | sphinx-build -M html docs/source/ docs/build/ 38 | - name: Deploy to GitHub Pages 39 | uses: peaceiris/actions-gh-pages@v3 40 | with: 41 | publish_branch: gh-pages 42 | github_token: ${{ secrets.GITHUB_TOKEN }} 43 | publish_dir: docs/build/html 44 | force_orphan: true 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /.github/workflows/check_arc_runner_env.yml: -------------------------------------------------------------------------------- 1 | name: Check Arc Runner Environment 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - check_env_workflow 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build: 12 | runs-on: arc-runner-set 13 | strategy: 14 | matrix: 15 | python-version: ["3.11"] 16 | steps: 17 | - uses: actions/checkout@v4 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v3 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Check arc runner environment 23 | run: | 24 | nvidia-smi 25 | echo $CUDA_VISIBLE_DEVICES -------------------------------------------------------------------------------- /.github/workflows/linting.yml: -------------------------------------------------------------------------------- 1 | name: Linting 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | workflow_dispatch: 8 | pull_request: 9 | types: [review_requested, ready_for_review, auto_merge_enabled] 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: ["3.11"] 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v3 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install pre-commit 27 | pre-commit install 28 | - name: Analysing the code with pre-commit 29 | run: | 30 | pre-commit run --all-files 31 | -------------------------------------------------------------------------------- /.github/workflows/tests_full.yml: 
-------------------------------------------------------------------------------- 1 | name: Tests Full 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build: 11 | runs-on: arc-runner-set 12 | strategy: 13 | matrix: 14 | python-version: ["3.11"] 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: Set up Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v3 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | - name: Install dependencies 22 | run: | 23 | sudo apt-get update 24 | sudo apt-get install curl -y # required by coveralls 25 | sudo apt-get install git -y 26 | python -m pip install torch==2.6.0 27 | python -m pip install --upgrade pip setuptools wheel 28 | export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE 29 | python -m pip install ninja # Lowers compilation time of flash attention significantly 30 | python -m pip install flash-attn --no-build-isolation 31 | python -m pip install -e .[tests] 32 | - name: Run tests 33 | run: | 34 | pytest 35 | # sh tests/run_all_tests.sh 0 1 36 | - name: Coveralls 37 | uses: coverallsapp/github-action@v2 38 | with: 39 | github-token: ${{ secrets.GITHUB_TOKEN }} 40 | # - name: Upload coverage data to coveralls.io 41 | # run: | 42 | # python -m pip install coveralls[toml] 43 | # COVERALLS_INPUT=.coverage_reports/.coverage coveralls --service=github 44 | # env: 45 | # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 46 | 47 | 48 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_install_hook_types: 2 | - pre-commit 3 | - prepare-commit-msg 4 | - commit-msg 5 | repos: 6 | - repo: https://github.com/pycqa/isort 7 | rev: 5.11.5 8 | hooks: 9 | - id: isort 10 | stages: [pre-commit] 11 | - repo: https://github.com/psf/black-pre-commit-mirror 12 | rev: 23.9.1 13 | hooks: 14 | - id: black 15 | language_version: python3.11 16 | stages: [pre-commit] 17 | - repo: https://github.com/astral-sh/ruff-pre-commit 18 | rev: v0.0.278 19 | hooks: 20 | - id: ruff 21 | args: [--fix, --exit-non-zero-on-fix] 22 | stages: [pre-commit] 23 | - repo: local 24 | hooks: 25 | - id: custom-commit-msg 26 | stages: [prepare-commit-msg] 27 | name: "Apply conventional commit constraints to default revert message" 28 | entry: ./scripts/convco_for_reverts.sh 29 | language: system 30 | types: [text] 31 | - repo: https://github.com/LuzianHahn/conventional-pre-commit 32 | rev: v2.4.1 33 | hooks: 34 | - id: conventional-pre-commit 35 | stages: [commit-msg] 36 | args: [feat, fix, ci, chore, test, refactor, debug, docs, perf, revert] 37 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | 3 | build: 4 | os: "ubuntu-22.04" 5 | tools: 6 | python: "3.10" 7 | 8 | python: 9 | install: 10 | - requirements: docs/requirements.txt 11 | 12 | sphinx: 13 | configuration: docs/source/conf.py 14 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: If you use this software, please cite both the article from preferred-citation and the software itself. 
3 | authors: 4 | - family-names: Lübbering 5 | given-names: Max 6 | - family-names: Ali 7 | given-names: Mehdi 8 | - family-names: Stollenwerk 9 | given-names: Felix 10 | - family-names: Fromm 11 | given-names: Michael 12 | - family-names: Weber 13 | given-names: Alexander Arno 14 | - family-names: Rutmann 15 | given-names: Richard 16 | title: 'Modalities: A PyTorch-native framework for distributed and reproducible foundation model training.' 17 | version: 0.3.2 18 | url: https://github.com/Modalities/modalities 19 | date-released: '2024-12-02' 20 | preferred-citation: 21 | authors: 22 | - family-names: Lübbering 23 | given-names: Max 24 | - family-names: Ali 25 | given-names: Mehdi 26 | - family-names: Stollenwerk 27 | given-names: Felix 28 | - family-names: Fromm 29 | given-names: Michael 30 | - family-names: Weber 31 | given-names: Alexander Arno 32 | - family-names: Rutmann 33 | given-names: Richard 34 | title: 'Modalities: A PyTorch-native framework for distributed and reproducible foundation model training.' 35 | url: https://github.com/Modalities/modalities 36 | type: generic 37 | year: '2024' 38 | conference: {} 39 | publisher: {} 40 | -------------------------------------------------------------------------------- /Dataset.md: -------------------------------------------------------------------------------- 1 | # MemMap Datasets 2 | 3 | ## MemMapDataset Index Generator 4 | 5 | The `MemMapDataset` requires an index file providing the necessary pointers into the raw data file. The `MemMapDataset` can create the index file lazily; however, it is advised to create it beforehand. This can be done by running 6 | 7 | ```sh 8 | modalities data create_raw_index <path_to_jsonl_file> 9 | ``` 10 | 11 | The index will be created in the same directory as the raw data file. For further options you may look into the usage documentation via `modalities data create_raw_index --help`. 12 | 13 | ## Packed Dataset Generator 14 | 15 | The `PackedMemMapDatasetContinuous` and `PackedMemMapDatasetMegatron` require a packed data file. To create the data file, you first have to generate a `MemMapDataset` index file as described [above](#memmapdataset-index-generator). Assuming the index and raw data are located in the same directory, you can simply execute the following command: 16 | 17 | ```sh 18 | modalities data pack_encoded_data <path_to_config_file> 19 | ``` 20 | 21 | The packed data file will be created in the same directory as the raw data file. For further options you may look into the usage documentation via `modalities data pack_encoded_data --help`. 22 | 23 | ### Packed Data Format 24 | 25 | The packed data file is a bytestream containing both the tokenized data as well as an index denoting the start and length of the tokenized documents inside the bytestream. The data file consists of 3 concatenated parts: 26 | 27 | header segment | data segment | index segment 28 | 29 | * **header segment**: This section is an 8-byte integer that encodes the length of the data segment in bytes. 30 | * **data segment**: This section contains a concatenation of all documents in the form of 4-byte tokens. 31 | An end-of-sequence token is placed between consecutive documents. 32 | * **index segment**: This section contains a pickled index which locates the documents inside the data segment. 33 | The index is a list of tuples, where each tuple contains the start position and length in bytes for the 34 | corresponding document, e.g., `[(start_doc1, len_doc1), (start_doc2, len_doc2), ....]`.
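To make the three segments concrete, here is a minimal reader sketch in plain Python. It is **not** the loader shipped with Modalities (see `src/modalities/dataloader/` for the actual implementation), and it makes assumptions the format description above does not pin down: a little-endian byte order, `uint32` tokens, and index offsets measured relative to the start of the data segment. The helper name `read_packed_file` is made up for illustration.

```python
import pickle

import numpy as np

HEADER_SIZE_IN_BYTES = 8  # header segment: length of the data segment in bytes
TOKEN_SIZE_IN_BYTES = 4   # each token occupies 4 bytes in the data segment


def read_packed_file(file_path: str) -> list[np.ndarray]:
    """Parse a packed data file into one token array per document."""
    with open(file_path, "rb") as f:
        raw = f.read()

    # header segment: length of the data segment in bytes (byte order assumed little-endian)
    data_segment_length = int.from_bytes(raw[:HEADER_SIZE_IN_BYTES], byteorder="little")

    # data segment: all documents concatenated as fixed-width tokens
    data_segment = raw[HEADER_SIZE_IN_BYTES : HEADER_SIZE_IN_BYTES + data_segment_length]

    # index segment: pickled list of (start, length) tuples in bytes,
    # assumed here to be relative to the start of the data segment
    index: list[tuple[int, int]] = pickle.loads(raw[HEADER_SIZE_IN_BYTES + data_segment_length :])

    documents = []
    for start, length in index:
        assert length % TOKEN_SIZE_IN_BYTES == 0, "document length must be a multiple of the token width"
        documents.append(np.frombuffer(data_segment[start : start + length], dtype=np.uint32))
    return documents
```

In practice you would memory-map the file instead of reading it into memory as a whole, which is exactly what the `PackedMemMapDataset*` classes are there for.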
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Modalities Project Team 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /config_files/data_preparation/packed_cc_en_2048.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | src_path: /workspaces/modalities/data/cc_en/raw/train.jsonl 3 | dst_path: /workspaces/modalities/data/cc_en/processed/train.pbin 4 | index_path: /workspaces/modalities/data/cc_en/processed/train.idx 5 | jq_pattern: .text 6 | num_cpus: ${node_env:num_cpus} 7 | eod_token: 8 | processing_batch_size: 1000 9 | raw_samples_queue_size: 300 10 | processed_samples_queue_size: 300 11 | 12 | tokenizer: 13 | component_key: tokenizer 14 | variant_key: pretrained_sp_tokenizer 15 | config: 16 | tokenizer_model_file: /workspaces/modalities/data/tokenizer/sp_bpe_en/bpe_tokenizer.model 17 | padding: false 18 | truncation: false 19 | -------------------------------------------------------------------------------- /config_files/data_preparation/packed_dataset_config.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | src_path: data/lorem_ipsum.jsonl 3 | dst_path: data/lorem_ipsum.pbin 4 | index_path: data/lorem_ipsum.idx 5 | jq_pattern: .text 6 | num_cpus: ${node_env:num_cpus} 7 | eod_token: <|endoftext|> 8 | processing_batch_size: 10 9 | raw_samples_queue_size: 20 10 | processed_samples_queue_size: 20 11 | 12 | tokenizer: 13 | component_key: tokenizer 14 | variant_key: pretrained_hf_tokenizer 15 | config: 16 | pretrained_model_name_or_path: data/tokenizer/hf_gpt2 17 | padding: false 18 | truncation: false 19 | -------------------------------------------------------------------------------- /data/checkpoints/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/data/checkpoints/.gitkeep -------------------------------------------------------------------------------- /data/lorem_ipsum.idx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/data/lorem_ipsum.idx -------------------------------------------------------------------------------- /data/lorem_ipsum.pbin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/data/lorem_ipsum.pbin -------------------------------------------------------------------------------- /data/lorem_ipsum_long.idx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/data/lorem_ipsum_long.idx -------------------------------------------------------------------------------- /data/lorem_ipsum_long.pbin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/data/lorem_ipsum_long.pbin -------------------------------------------------------------------------------- /data/tokenizer/hf_gpt2/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "bos_token": "<|endoftext|>", 3 | "eos_token": "<|endoftext|>", 4 | "unk_token": "<|endoftext|>" 5 | } 6 | -------------------------------------------------------------------------------- /data/tokenizer/hf_gpt2/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "add_prefix_space": false, 3 | "added_tokens_decoder": { 4 | "50256": { 5 | "content": "<|endoftext|>", 6 | "lstrip": false, 7 | "normalized": true, 8 | "rstrip": false, 9 | "single_word": false, 10 | "special": true 11 | } 12 | }, 13 | "bos_token": "<|endoftext|>", 14 | "clean_up_tokenization_spaces": true, 15 | "eos_token": "<|endoftext|>", 16 | "model_max_length": 1024, 17 | "tokenizer_class": "GPT2Tokenizer", 18 | "unk_token": "<|endoftext|>" 19 | } 20 | -------------------------------------------------------------------------------- /data/tokenizer/sentencepiece_dclm/en_32k_tokenizer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/data/tokenizer/sentencepiece_dclm/en_32k_tokenizer.model -------------------------------------------------------------------------------- /data/wiki_data_downloader.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Create the "data" folder if it doesn't exist 4 | mkdir -p data 5 | 6 | # Download the files into the "data" folder 7 | wget -P data https://public-nlp-datasets.s3.us-west-2.amazonaws.com/wikihowAll.csv 8 | wget -P data https://public-nlp-datasets.s3.us-west-2.amazonaws.com/wikihowSep.csv -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/dev_ops/release_procedure.md: -------------------------------------------------------------------------------- 1 | # Releasing in Modalities 2 | This tutorial describes the procedure to release a new version of the Modalities package. 3 | 4 | ## Release Types 5 | We follow the release types as defined by [Semantic Versioning](https://semver.org/). The version number is defined as `MAJOR.MINOR.PATCH` where: 6 | - `MAJOR` is incremented when you make incompatible API changes, 7 | - `MINOR` is incremented when you add functionality in a backwards-compatible manner, and 8 | - `PATCH` is incremented when you make backwards-compatible bug fixes. 9 | 10 | 11 | ## Releasing a new Modalities version 0. Make sure that the main branch is in a clean state. In particular, all tests should pass. 13 | 1. Update the version number in the `pyproject.toml` and `CITATION.cff` files. 14 | 2. Commit the version bump via `git commit --no-verify -m "<version>"`, following the versioning convention **v**MAJOR.MINOR.PATCH (Note the leading v!). 15 | The `--no-verify` flag is used to skip the pre-commit hooks. 16 | 3. Run `git push` to push the changes to the remote repository. 17 | 4. Tag the commit with the version number following the convention `git tag <version>`. 18 | 5. Push the tag to the remote repository using `git push --tags`. Note that this command will push all the tags to the remote repository. 19 | This command triggers the [CI/CD pipeline](../../.github/workflows/release_automation.yml) to build and deploy the package to the PyPI repository. 20 | -------------------------------------------------------------------------------- /docs/dev_ops/tests.md: -------------------------------------------------------------------------------- 1 | # Testing Modalities 2 | 3 | Modalities has a threefold setup for testing, namely 4 | 5 | * Main tests
6 | The main tests comprise CPU, single-GPU and multi-GPU tests. The latter create a distributed environment internally and allow end2end testing of Modalities. 7 | Each test defines its requirements (typically the number of GPUs) and is skipped if they are not met. 8 | 9 | * Torchrun tests
10 | These tests are run from a shell script using torchrun and are typically end2end or at least integration tests. Since we implemented distributed testing using multiprocessing within Modalities, these tests will be integrated into the main tests in the long term. Note that some of the torchrun tests have already been migrated to the main tests. 11 | 12 | * Example / Tutorial tests
13 | These tests take an example config (e.g., a training or warmstart config) and execute it. They only verify that the config runs through without errors; the results of the run are not checked and have to be inspected manually. 14 | 15 | ## Testing Entry Points 16 | There is a single entrypoint to run all test types specified above. 17 | For a full specification of the test API run 18 | 19 | ```bash 20 | cd modalities 21 | python tests/tests.py --help 22 | ``` 23 | 24 | in your command line. 25 | 26 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==7.1.2 2 | sphinx-rtd-theme==1.3.0rc1 3 | -------------------------------------------------------------------------------- /docs/scaling_experiments/scaling_28B_mbs_1_ac_True.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/docs/scaling_experiments/scaling_28B_mbs_1_ac_True.png -------------------------------------------------------------------------------- /docs/source/banner.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/docs/source/banner.jpg -------------------------------------------------------------------------------- /docs/source/benchmarking.rst: -------------------------------------------------------------------------------- 1 | Benchmarking 2 | ============================= 3 | **EDIT "docs/source/benchmarking.rst" IN ORDER TO MAKE CHANGES HERE** 4 | -------------------------------------------------------------------------------- /docs/source/entrypoints.rst: -------------------------------------------------------------------------------- 1 | .. role:: python(code) 2 | :language: python 3 | 4 | .. role:: bash(code) 5 | :language: bash 6 | 7 | 8 | Entrypoints 9 | ======================================================= 10 | 11 | We use `click <https://click.palletsprojects.com/>`_ as a tool to add new entry points and their CLI arguments. 12 | For this we have a main entry point from which all other entry points are started.
13 | 14 | The main entry point is :file:`src/modalities/__main__.py:main()`. 15 | We register other sub-entrypoints by using our main :python:`click.group`, called :python:`main`, as follows: 16 | 17 | .. code-block:: python 18 | 19 | @main.command(name="my_new_entry_point") 20 | 21 | 22 | See the following full example: 23 | 24 | .. code-block:: python 25 | 26 | from pathlib import Path 27 | import click 28 | import click_pathlib 29 | 30 | 31 | @click.group() 32 | def main() -> None: 33 | pass 34 | 35 | 36 | config_option = click.option( 37 | "--config_file_path", 38 | type=click_pathlib.Path(exists=False), 39 | required=True, 40 | help="Path to the YAML config file.", 41 | ) 42 | 43 | 44 | @main.command(name="do_stuff") 45 | @config_option 46 | @click.option( 47 | "--my_cli_argument", 48 | type=int, 49 | required=True, 50 | help="New integer argument", 51 | ) 52 | def entry_point_do_stuff(config_file_path: Path, my_cli_argument: int): 53 | print(f"Do stuff with {config_file_path} and {my_cli_argument}...") 54 | ... 55 | 56 | if __name__ == "__main__": 57 | main() 58 | 59 | With 60 | 61 | .. code-block:: toml 62 | 63 | [project.scripts] 64 | modalities = "modalities.__main__:main" 65 | 66 | in our :file:`pyproject.toml`, we can either invoke only :python:`main` via :python:`modalities` (which does nothing on its own), or a specific sub-entrypoint, e.g., :bash:`modalities do_stuff --config_file_path config_files/config.yaml --my_cli_argument 3537`. 67 | 68 | Alternatively, run the module directly via :bash:`python src/modalities/__main__.py do_stuff --config_file_path config_files/config.yaml --my_cli_argument 3537`. 69 | -------------------------------------------------------------------------------- /docs/source/future_work.rst: -------------------------------------------------------------------------------- 1 | Future Work 2 | ======================================================= 3 | 4 | The team is currently extending the established LLM code base to bring multi-modality into the mix. This extension will be based on ideas similar to CoCa and/or AudioPaLM, which would enable users to either use different encoders for different modalities in conjunction with a text-based decoder, or use a decoder-only architecture. 5 | In the future, modalities other than text can be used, namely 6 | 7 | * image 8 | * audio 9 | * video 10 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to Modalities' documentation! 2 | ====================================================================== 3 | 4 | We propose a novel training framework for Multimodal Large Language Models (LLMs) that prioritizes code readability and efficiency. 5 | The codebase adheres to the principles of "clean code," minimizing Lines of Code (LoC) while maintaining extensibility. 6 | A single, comprehensive configuration file enables easy customization of various model and training parameters. 7 | 8 | A key innovation is the adoption of a PyTorch-native training loop integrated with the Fully Sharded Data Parallelism (FSDP) technique. 9 | FSDP optimizes memory usage and training speed, enhancing scalability for large-scale multimodal models. 10 | By leveraging PyTorch's native capabilities, our framework simplifies the development process and promotes ease of maintenance. 11 | 12 | The framework's modular design facilitates experimentation with different multimodal architectures and training strategies.
13 | Users can seamlessly integrate diverse datasets and model components, allowing for comprehensive exploration of multimodal learning tasks. 14 | The combination of clean code, minimal configuration, and PyTorch-native training with FSDP contributes to a user-friendly and efficient platform for developing state-of-the-art multimodal language models. 15 | 16 | .. note:: 17 | 18 | This project is under active development. 19 | 20 | .. toctree:: 21 | :caption: Getting Started 22 | 23 | quickstart 24 | configuration 25 | model_cards 26 | benchmarking 27 | known_issues 28 | 29 | .. toctree:: 30 | :caption: Datasets 31 | 32 | memmap 33 | 34 | .. toctree:: 35 | :caption: Entrypoints 36 | 37 | entrypoints 38 | 39 | .. toctree:: 40 | :caption: VSCode Setup 41 | 42 | vs_code_setup 43 | 44 | 45 | .. toctree:: 46 | :caption: Future Work 47 | 48 | future_work 49 | 50 | .. toctree:: 51 | :caption: API 52 | 53 | api/modules -------------------------------------------------------------------------------- /docs/source/known_issues.rst: -------------------------------------------------------------------------------- 1 | Known Issues 2 | ================================================================== 3 | 4 | `GitHub Issues <https://github.com/Modalities/modalities/issues>`_ 5 | 6 | 1. hardcoded dataset path :file:`/raid/s3/opengptx/mehdi/temp/temp_data/train_text_document.bin` in :file:`config/config.yaml` 7 | 2. Dependency on Weights & Biases 8 | -------------------------------------------------------------------------------- /docs/source/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/docs/source/logo.jpg -------------------------------------------------------------------------------- /docs/source/memmap.rst: -------------------------------------------------------------------------------- 1 | .. role:: python(code) 2 | :language: python 3 | 4 | .. role:: bash(code) 5 | :language: bash 6 | 7 | MemMap Datasets 8 | ==================================================== 9 | 10 | MemMapDataset Index Generator 11 | ------------------------------------------------------------------------------ 12 | 13 | The :python:`MemMapDataset` requires an index file providing the necessary pointers into the raw data file. The :python:`MemMapDataset` can create the index file lazily; however, it is advised to create it beforehand. This can be done by running 14 | 15 | .. code-block:: bash 16 | 17 | modalities data create_raw_index <path_to_jsonl_file> 18 | 19 | The index will be created in the same directory as the raw data file. For further options you may look into the usage documentation via :bash:`modalities data create_raw_index --help`. 20 | 21 | Packed Dataset Generator 22 | -------------------------------------------------------------------------------- 23 | 24 | The :python:`PackedMemMapDatasetContinuous` and :python:`PackedMemMapDatasetMegatron` require a packed data file. To create the data file, you first have to generate a :python:`MemMapDataset` index file as described above. Assuming the index and raw data are located in the same directory, you can simply execute the following command: 25 | 26 | .. code-block:: bash 27 | 28 | modalities data pack_encoded_data <path_to_config_file> 29 | 30 | The packed data file will be created in the same directory as the raw data file. For further options you may look into the usage documentation via :bash:`modalities data pack_encoded_data --help`.
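To see both steps end-to-end, they can be chained on the lorem ipsum sample data that ships with this repository. This is an illustrative sketch: the argument of :bash:`pack_encoded_data` is assumed here to be a data-preparation config such as :file:`config_files/data_preparation/packed_dataset_config.yaml` (whose :python:`dst_path` points to :file:`data/lorem_ipsum.pbin`); check :bash:`--help` for the authoritative CLI signature.

.. code-block:: bash

   # 1) create the index next to the raw jsonl file (-> data/lorem_ipsum.idx)
   modalities data create_raw_index data/lorem_ipsum.jsonl

   # 2) tokenize and pack the data as specified in the preparation config (-> data/lorem_ipsum.pbin)
   modalities data pack_encoded_data config_files/data_preparation/packed_dataset_config.yaml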
31 | 32 | Packed Data Format 33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 34 | 35 | The packed data file is a bytestream containing both the tokenized data as well as an index denoting the start and length of the tokenized documents inside the bytestream. The data file consists of 3 concatenated parts: 36 | 37 | header segment | data segment | index segment 38 | 39 | * **header segment**: This section is an 8-byte integer that encodes the length of the data segment in bytes. 40 | * **data segment**: This section contains a concatenation of all documents in the form of 4-byte tokens. An end-of-sequence token is placed between consecutive documents. 41 | * **index segment**: This section contains a pickled index which locates the documents inside the data segment. The index is a list of tuples, where each tuple contains the start position and length in bytes for the corresponding document, e.g., :python:`[(start_doc1, len_doc1), (start_doc2, len_doc2), ....]`. 42 | -------------------------------------------------------------------------------- /docs/source/model_cards.rst: -------------------------------------------------------------------------------- 1 | Model Cards 2 | ==================================================== 3 | 4 | -------------------------------------------------------------------------------- /docs/source/quickstart.rst: -------------------------------------------------------------------------------- 1 | Quickstart 2 | ==================================================== 3 | 4 | Installation 5 | ----------------------------------------------------- 6 | Set up a conda environment `conda create -n modalities python=3.10 && conda activate modalities` and install the requirements `pip install -e .`. 7 | 8 | Setup Dataset 9 | ------------------------------------------------- 10 | To start a training, you first need to create a memmap index from a jsonl file, then pack the dataset, and finally run the training. 11 | 12 | .. code-block:: bash 13 | 14 | # Create memmap index from jsonl file. 15 | modalities data create_raw_index <path_to_jsonl_file> 16 | 17 | # Create packed dataset. 18 | modalities data pack_encoded_data <path_to_jsonl_file> 19 | 20 | For instance, using the lorem ipsum example: 21 | 22 | .. code-block:: bash 23 | 24 | # Create memmap index from jsonl file. 25 | modalities data create_raw_index data/lorem_ipsum.jsonl 26 | 27 | # Create packed dataset. 28 | modalities data pack_encoded_data data/lorem_ipsum.jsonl 29 | 30 | Training 31 | ---------------------------------------------------- 32 | To run a training in a multi-GPU setting, the following environment variables are required: 33 | 34 | .. code-block:: bash 35 | 36 | CUDA_VISIBLE_DEVICES=0,1 torchrun --nnodes 1 --nproc_per_node 2 --rdzv-endpoint=0.0.0.0:29502 src/modalities/__main__.py run --config_file_path config_files/config_lorem_ipsum.yaml 37 | 38 | **Evaluation:** 39 | WIP: contents to be added. 40 | -------------------------------------------------------------------------------- /docs/source/vs_code_setup.rst: -------------------------------------------------------------------------------- 1 | VSCode Setup 2 | ==================================================== 3 | 4 | 5 | 6 | We recommend a Docker environment based on a recent PyTorch image, e.g.: 7 | 8 | ..
code-block:: bash 9 | 10 | FROM pytorch/pytorch:2.1.2-cuda12.1-cudnn8-devel 11 | RUN apt-get update && apt-get install -y wget openssh-client git-core bash-completion 12 | RUN wget -O /tmp/git-lfs.deb https://packagecloud.io/github/git-lfs/packages/ubuntu/focal/git-lfs_2.13.3_amd64.deb/download.deb && \ 13 | dpkg -i /tmp/git-lfs.deb && \ 14 | rm /tmp/git-lfs.deb 15 | RUN echo 'source /usr/share/bash-completion/completions/git' >> ~/.bashrc 16 | CMD ["/bin/bash"] 17 | 18 | This works seamlessly in combination with the VSCode DevContainer extension: 19 | 20 | .. code-block:: json 21 | 22 | { 23 | "name": "Dev Container", 24 | "dockerFile": "Dockerfile", 25 | "runArgs": [ 26 | "--network", 27 | "host", 28 | "--gpus", 29 | "all" 30 | ], 31 | "customizations": { 32 | "vscode": { 33 | "settings": { 34 | "terminal.integrated.shell.linux": "/bin/bash" 35 | }, 36 | "extensions": [ 37 | "ms-python.python" 38 | ] 39 | } 40 | } 41 | } 42 | 43 | In VSCode, add this to your :file:`launch.json`: 44 | 45 | .. code-block:: json 46 | 47 | { 48 | "name": "Torchrun Train and Eval", 49 | "type": "python", 50 | "request": "launch", 51 | "module": "torch.distributed.run", 52 | "env": { 53 | "CUDA_VISIBLE_DEVICES": "4,5" 54 | }, 55 | "args": [ 56 | "--nnodes", 57 | "1", 58 | "--nproc_per_node", 59 | "2", 60 | "--rdzv-endpoint=0.0.0.0:29503", 61 | "src/modalities/__main__.py", 62 | "run", 63 | "--config_file_path", 64 | "config_files/config_lorem_ipsum.yaml" 65 | ], 66 | "console": "integratedTerminal", 67 | "justMyCode": true, 68 | "envFile": "${workspaceFolder}/.env", 69 | "cwd": "${workspaceFolder}/modalities" 70 | } 71 | 72 | -------------------------------------------------------------------------------- /notebooks/components.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | referencing_keys: 3 | sample_key: input_ids 4 | training: 5 | local_train_micro_batch_size: 8 6 | sequence_length: 2048 7 | 8 | tokenizer: 9 | component_key: tokenizer 10 | variant_key: pretrained_sp_tokenizer 11 | config: 12 | tokenizer_model_file: /workspaces/modalities/notebooks/tokenizer/unigram_tokenizer.model 13 | padding: false 14 | truncation: false 15 | 16 | train_dataset: 17 | component_key: dataset 18 | variant_key: packed_mem_map_dataset_continuous 19 | config: 20 | raw_data_path: /workspaces/modalities/notebooks/tokenizer/redpyjama_v2_default_DE_num_docs_65536.pbin 21 | block_size: ${settings.training.sequence_length} 22 | sample_key: ${settings.referencing_keys.sample_key} 23 | 24 | val_dataset: 25 | component_key: dataset 26 | variant_key: packed_mem_map_dataset_continuous 27 | config: 28 | raw_data_path: /workspaces/modalities/notebooks/tokenizer/redpyjama_v2_default_DE_num_docs_65536.pbin 29 | block_size: ${settings.training.sequence_length} 30 | sample_key: ${settings.referencing_keys.sample_key} -------------------------------------------------------------------------------- /notebooks/tokenizer/redpajama_v2_samples_512_test.idx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/notebooks/tokenizer/redpajama_v2_samples_512_test.idx -------------------------------------------------------------------------------- /notebooks/tokenizer/redpajama_v2_samples_512_test.pbin: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/notebooks/tokenizer/redpajama_v2_samples_512_test.pbin -------------------------------------------------------------------------------- /notebooks/tokenizer/unigram_tokenizer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/notebooks/tokenizer/unigram_tokenizer.model -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "modalities" 3 | version = "0.3.2" 4 | requires-python = ">=3.10,<3.12" 5 | description = "Modalities, a PyTorch-native framework for distributed and reproducible foundation model training." 6 | readme = "README.md" 7 | dependencies = [ 8 | "numpy<2.0", 9 | "torch==2.6.0", 10 | "packaging", 11 | "tqdm", 12 | "pyyaml", 13 | "transformers", 14 | "datasets", 15 | "protobuf", 16 | "SentencePiece", 17 | "rich", 18 | "omegaconf", 19 | "pydantic", 20 | "click", 21 | "click_pathlib", 22 | "jq", 23 | "class_resolver", 24 | "wandb", 25 | "einops>=0.7.0", 26 | ] 27 | 28 | [project.urls] 29 | Homepage = "https://github.com/Modalities/modalities" 30 | Issues = "https://github.com/Modalities/modalities/issues" 31 | 32 | [project.optional-dependencies] 33 | linting = ["pre-commit"] 34 | tests = ["pytest", "pytest-cov", "debugpy"] 35 | install_helper = ["ninja"] 36 | 37 | [project.scripts] 38 | modalities = "modalities.__main__:main" 39 | 40 | [build-system] 41 | requires = ["setuptools >= 61.0.0"] 42 | build-backend = "setuptools.build_meta" 43 | 44 | [tool.black] 45 | target-version = ["py310"] 46 | line-length = 120 47 | 48 | [tool.isort] 49 | profile = "black" 50 | line_length = 120 51 | src_paths = ["src", "tests"] 52 | 53 | [tool.ruff] 54 | line-length = 120 55 | 56 | [tool.pytest.ini_options] 57 | addopts = "--cov=src --cov-report term --cov-report html" 58 | 59 | [tool.coverage.run] 60 | branch = true 61 | omit = ["*/src/modalities/dataloader/open_gptx_dataset/*"] 62 | 63 | [tool.coverage.report] 64 | # Regexes for lines to exclude from consideration 65 | exclude_also = [ 66 | # Don't complain about missing debug-only code: 67 | "def __repr__", 68 | "if self\\.debug", 69 | 70 | # Don't complain if tests don't hit defensive assertion code: 71 | "raise AssertionError", 72 | "raise NotImplementedError", 73 | 74 | # Don't complain if non-runnable code isn't run: 75 | "if 0:", 76 | "if __name__ == .__main__.:", 77 | 78 | # Don't complain about abstract methods, they aren't run: 79 | "@(abc\\.)?abstractmethod", 80 | ] 81 | 82 | 83 | ignore_errors = true 84 | 85 | [tool.coverage.html] 86 | directory = "coverage_html_report" -------------------------------------------------------------------------------- /scripts/convco_for_reverts.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | commit_msg_file="$1" 4 | default_revert_msg=$(cat "$commit_msg_file") 5 | convco_aligned_revert_msg=$(echo "$default_revert_msg" | sed '1s/^Revert /revert: /') 6 | echo "$convco_aligned_revert_msg" > "$commit_msg_file" 7 | -------------------------------------------------------------------------------- /scripts/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 torchrun --rdzv-endpoint 
localhost:29504 --nnodes 1 --nproc_per_node 6 $(which modalities) run --config_file_path ../config_files/config_example_mem_map_dataset.yaml -------------------------------------------------------------------------------- /src/modalities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/__init__.py -------------------------------------------------------------------------------- /src/modalities/checkpointing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/checkpointing/__init__.py -------------------------------------------------------------------------------- /src/modalities/checkpointing/checkpoint_conversion.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from modalities.config.config import load_app_config_dict 4 | from modalities.models.huggingface_adapters.hf_adapter import HFModelAdapter, HFModelAdapterConfig 5 | 6 | 7 | class CheckpointConversion: 8 | """Class to convert a PyTorch checkpoint to a Hugging Face checkpoint.""" 9 | 10 | def __init__( 11 | self, 12 | config_file_path: Path, 13 | output_hf_checkpoint_dir: Path, 14 | ): 15 | """ 16 | Initializes the CheckpointConversion object. 17 | 18 | Args: 19 | config_file_path (Path): The path to the configuration file containing the pytorch model configuration. 20 | output_hf_checkpoint_dir (Path): The path to the output Hugging Face checkpoint directory. 21 | 22 | Raises: 23 | ValueError: If the config_file_path does not exist. 24 | 25 | """ 26 | self.output_hf_checkpoint_dir = output_hf_checkpoint_dir 27 | if not config_file_path.exists(): 28 | raise ValueError(f"Could not find {config_file_path}.") 29 | 30 | self.config_dict = load_app_config_dict(config_file_path) 31 | 32 | def convert_pytorch_to_hf_checkpoint(self, prediction_key: str) -> HFModelAdapter: 33 | """ 34 | Converts a PyTorch checkpoint to a Hugging Face checkpoint. 35 | 36 | Args: 37 | prediction_key (str): The prediction key to be used in the HFModelAdapter. 38 | 39 | Returns: 40 | HFModelAdapter: The converted Hugging Face model adapter. 41 | 42 | """ 43 | config = HFModelAdapterConfig(config=self.config_dict) 44 | hf_model = HFModelAdapter(config=config, prediction_key=prediction_key, load_checkpoint=True) 45 | hf_model.save_pretrained(self.output_hf_checkpoint_dir, safe_serialization=False) 46 | return hf_model 47 | -------------------------------------------------------------------------------- /src/modalities/checkpointing/checkpoint_loading.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from pathlib import Path 3 | 4 | import torch.nn as nn 5 | from torch.optim import Optimizer 6 | 7 | from modalities.checkpointing.stateful.app_state import AppState 8 | 9 | 10 | class DistributedCheckpointLoadingIF(ABC): 11 | """Distributed checkpoint loading interface for loading PyTorch models and optimizer checkpoints.""" 12 | 13 | @abstractmethod 14 | def load_checkpoint_(self, app_state: AppState, checkpoint_dir_path: Path) -> AppState: 15 | """Loads the distributed checkpoint from the specified directory path into the AppState. 
16 | 17 | Args: 18 | app_state (AppState): The application state with the model, optimizer and lr scheduler. 19 | checkpoint_dir_path (Path): The directory path to the distributed checkpoint. 20 | 21 | Raises: 22 | NotImplementedError: This abstract method is not implemented and should be overridden in a subclass. 23 | 24 | Returns: 25 | AppState: The application state with the loaded checkpoint. 26 | """ 27 | raise NotImplementedError 28 | 29 | 30 | class FSDP1CheckpointLoadingIF(ABC): 31 | """Checkpoint loading interface for loading PyTorch models and optimizer checkpoints.""" 32 | 33 | @abstractmethod 34 | def load_model_checkpoint(self, model: nn.Module, file_path: Path) -> nn.Module: 35 | """ 36 | Loads a model checkpoint from the specified file path. 37 | 38 | Args: 39 | model (nn.Module): The model to load the checkpoint into. 40 | file_path (Path): The path to the checkpoint file. 41 | 42 | Returns: 43 | nn.Module: The loaded model with the checkpoint parameters. 44 | 45 | Raises: 46 | NotImplementedError: This abstract method is not implemented and should be overridden in a subclass. 47 | """ 48 | raise NotImplementedError 49 | 50 | @abstractmethod 51 | def load_optimizer_checkpoint_( 52 | self, 53 | optimizer: Optimizer, 54 | model: nn.Module, 55 | file_path: Path, 56 | ): 57 | """ 58 | Loads an optimizer checkpoint from the specified file path (in-place). 59 | 60 | Args: 61 | optimizer (Optimizer): The optimizer to load the checkpoint into (in-place). 62 | model (nn.Module): The model associated with the optimizer. 63 | file_path (Path): The path to the checkpoint file. 64 | 65 | Raises: 66 | NotImplementedError: This abstract method is not implemented and should be overridden in a subclass. 67 | """ 68 | raise NotImplementedError 69 | -------------------------------------------------------------------------------- /src/modalities/checkpointing/checkpoint_saving.py: -------------------------------------------------------------------------------- 1 | from modalities.batch import EvaluationResultBatch 2 | from modalities.checkpointing.checkpoint_saving_execution import CheckpointSavingExecutionABC 3 | from modalities.checkpointing.checkpoint_saving_strategies import CheckpointSavingStrategyIF 4 | from modalities.checkpointing.stateful.app_state import AppState 5 | from modalities.training.training_progress import TrainingProgress 6 | 7 | 8 | class CheckpointSaving: 9 | """Class for saving checkpoints based on a saving strategy and an execution strategy.""" 10 | 11 | def __init__( 12 | self, 13 | checkpoint_saving_strategy: CheckpointSavingStrategyIF, 14 | checkpoint_saving_execution: CheckpointSavingExecutionABC, 15 | ): 16 | """ 17 | Initializes the CheckpointSaving object. 18 | 19 | Args: 20 | checkpoint_saving_strategy (CheckpointSavingStrategyIF): The strategy for saving checkpoints. 21 | checkpoint_saving_execution (CheckpointSavingExecutionABC): The execution for saving checkpoints. 22 | """ 23 | self.checkpoint_saving_strategy = checkpoint_saving_strategy 24 | self.checkpoint_saving_execution = checkpoint_saving_execution 25 | 26 | def save_checkpoint( 27 | self, 28 | training_progress: TrainingProgress, 29 | evaluation_result: dict[str, EvaluationResultBatch], 30 | app_state: AppState, 31 | early_stoppping_criterion_fulfilled: bool = False, 32 | ): 33 | """ 34 | Saves a checkpoint of the model and optimizer. 35 | 36 | Args: 37 | training_progress (TrainingProgress): The training progress. 38 | evaluation_result (dict[str, EvaluationResultBatch]): The evaluation result.
39 | app_state (AppState): The application state to be checkpointed. 40 | early_stoppping_criterion_fulfilled (bool, optional): 41 | Whether the early stopping criterion is fulfilled. Defaults to False. 42 | """ 43 | checkpointing_instruction = self.checkpoint_saving_strategy.get_checkpoint_instruction( 44 | training_progress=training_progress, 45 | evaluation_result=evaluation_result, 46 | early_stoppping_criterion_fulfilled=early_stoppping_criterion_fulfilled, 47 | ) 48 | 49 | self.checkpoint_saving_execution.run_checkpoint_instruction( 50 | checkpointing_instruction=checkpointing_instruction, 51 | training_progress=training_progress, 52 | app_state=app_state, 53 | ) 54 | -------------------------------------------------------------------------------- /src/modalities/checkpointing/checkpoint_saving_execution.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | from modalities.checkpointing.checkpoint_saving_instruction import CheckpointingInstruction 4 | from modalities.checkpointing.stateful.app_state import AppState 5 | from modalities.training.training_progress import TrainingProgress 6 | 7 | 8 | class CheckpointSavingExecutionABC(ABC): 9 | """Abstract class for saving PyTorch model and optimizer checkpoints.""" 10 | 11 | @abstractmethod 12 | def _save_checkpoint(self, app_state: AppState, training_progress: TrainingProgress): 13 | """ 14 | Saves the checkpoint of the model and optimizer. 15 | 16 | Args: 17 | app_state (AppState): The application state to be checkpointed. 18 | training_progress (TrainingProgress): The training progress. 19 | 20 | Raises: 21 | NotImplementedError: This method is not implemented and should be overridden in a subclass. 22 | """ 23 | raise NotImplementedError 24 | 25 | @abstractmethod 26 | def _delete_checkpoint(self, training_progress: TrainingProgress): 27 | """ 28 | Deletes the checkpoint based on the training progress. 29 | 30 | Args: 31 | training_progress (TrainingProgress): The training progress. 32 | 33 | Raises: 34 | NotImplementedError: This abstract method is not implemented and should be overridden in a subclass. 35 | """ 36 | raise NotImplementedError 37 | 38 | def run_checkpoint_instruction( 39 | self, 40 | checkpointing_instruction: CheckpointingInstruction, 41 | training_progress: TrainingProgress, 42 | app_state: AppState, 43 | ): 44 | """ 45 | Runs the checkpoint instruction. 46 | 47 | Args: 48 | checkpointing_instruction (CheckpointingInstruction): The checkpointing instruction. 49 | training_progress (TrainingProgress): The training progress. 50 | app_state (AppState): The application state to be checkpointed. 51 | """ 52 | if checkpointing_instruction.save_current: 53 | self._save_checkpoint(app_state=app_state, training_progress=training_progress) 54 | 55 | for training_progress_to_delete in checkpointing_instruction.checkpoints_to_delete: 56 | self._delete_checkpoint(training_progress=training_progress_to_delete) 57 | -------------------------------------------------------------------------------- /src/modalities/checkpointing/checkpoint_saving_instruction.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from modalities.training.training_progress import TrainingProgress 4 | 5 | 6 | @dataclass 7 | class CheckpointingInstruction: 8 | """ 9 | Represents a checkpointing instruction (i.e., saving and deleting). 
10 | 11 | Attributes: 12 | save_current (bool): Indicates whether to save the current checkpoint. 13 | checkpoints_to_delete (list[TrainingProgress]): List of checkpoint IDs to delete. 14 | """ 15 | 16 | save_current: bool = False 17 | checkpoints_to_delete: list[TrainingProgress] = field(default_factory=list) 18 | -------------------------------------------------------------------------------- /src/modalities/checkpointing/fsdp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/checkpointing/fsdp/__init__.py -------------------------------------------------------------------------------- /src/modalities/checkpointing/stateful/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/checkpointing/stateful/__init__.py -------------------------------------------------------------------------------- /src/modalities/checkpointing/stateful/app_state_factory.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Optional 3 | 4 | import torch.distributed as dist 5 | import torch.nn as nn 6 | from torch.optim import Optimizer 7 | from torch.optim.lr_scheduler import LRScheduler 8 | 9 | from modalities.checkpointing.fsdp.fsdp_checkpoint_loading import DCPCheckpointLoading 10 | from modalities.checkpointing.stateful.app_state import AppState 11 | 12 | 13 | class AppStateFactory: 14 | """Factory class to create AppState objects.""" 15 | 16 | @staticmethod 17 | def get_raw_app_state( 18 | model: nn.Module, optimizer: Optimizer, lr_scheduler: Optional[LRScheduler] = None 19 | ) -> AppState: 20 | """Creates a new (non-checkpoint loaded) AppState object from an instantiated 21 | model, optimizer, and optional learning rate scheduler. 22 | 23 | Args: 24 | model (nn.Module): The model can be either a non-sharded model, FSDP1 or FSDP2 model. 25 | optimizer (Optimizer): The optimizer can be either a non-sharded optimizer, FSDP1 or FSDP2 optimizer. 26 | lr_scheduler (Optional[LRScheduler], optional): Lr scheduler used during training. Defaults to None. 27 | 28 | Returns: 29 | AppState: The AppState object. 30 | """ 31 | app_state = AppState(model=model, optimizer=optimizer, lr_scheduler=lr_scheduler) 32 | return app_state 33 | 34 | @staticmethod 35 | def get_dcp_checkpointed_app_state_( 36 | raw_app_state: AppState, 37 | checkpoint_dir_path: Path, 38 | ) -> AppState: 39 | """Loads the checkpointed state dict into the raw AppState object 40 | (i.e., non-checkpoint loaded AppState) in-place. 41 | 42 | Args: 43 | raw_app_state (AppState): The raw AppState object. 44 | checkpoint_dir_path (Path): The path to the checkpoint directory. 45 | 46 | Raises: 47 | RuntimeError: Raises an error if the state dict has already been loaded. 48 | 49 | Returns: 50 | AppState: The AppState object with the loaded state dict. 51 | """ 52 | if raw_app_state.is_loaded: 53 | raise RuntimeError( 54 | "Cannot call load_state_dict twice on the same AppState object. " "State dict has already been loaded." 
55 | ) 56 | cp_loading = DCPCheckpointLoading(global_rank=dist.get_rank()) 57 | cp_loading.load_checkpoint_(app_state=raw_app_state, checkpoint_dir_path=checkpoint_dir_path) 58 | return raw_app_state 59 | -------------------------------------------------------------------------------- /src/modalities/checkpointing/torch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/checkpointing/torch/__init__.py -------------------------------------------------------------------------------- /src/modalities/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/config/__init__.py -------------------------------------------------------------------------------- /src/modalities/config/lookup_enum.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class LookupEnum(Enum): 5 | @classmethod 6 | def _missing_(cls, value: str) -> type: 7 | """Constructs the Enum by member name if it is not constructable by value.""" 8 | return cls.__dict__[value] 9 | -------------------------------------------------------------------------------- /src/modalities/config/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import torch 4 | from pydantic import BaseModel 5 | 6 | 7 | def convert_base_model_config_to_dict(config: BaseModel) -> dict[Any, Any]: 8 | """Converts a Pydantic BaseModel to a dictionary (non-recursively).""" 9 | return {key: getattr(config, key) for key in config.model_dump().keys()} 10 | 11 | 12 | def parse_torch_device(device: str | int) -> torch.device: 13 | if isinstance(device, str) and device != "cpu": 14 | raise ValueError(f"Invalid device_id: {device}") 15 | elif isinstance(device, int): 16 | device_id = f"cuda:{device}" 17 | else: 18 | device_id = "cpu" 19 | device = torch.device(device_id) 20 | return device 21 | -------------------------------------------------------------------------------- /src/modalities/conversion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/conversion/__init__.py -------------------------------------------------------------------------------- /src/modalities/conversion/gpt2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/conversion/gpt2/__init__.py -------------------------------------------------------------------------------- /src/modalities/conversion/gpt2/conversion_code.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | 5 | def _copy_model_files(output_dir: str): 6 | source_dir = os.path.dirname(__file__) 7 | modeling_gpt2_path = os.path.join(source_dir, "modeling_gpt2.py") 8 | configuration_gpt2_path = os.path.join(source_dir, "configuration_gpt2.py") 9 | shutil.copy(modeling_gpt2_path, output_dir) 10 | shutil.copy(configuration_gpt2_path, output_dir) 11 | 12 | 13 | def _change_modalities_import_to_relative_import(output_dir: str): 14 |
target_modeling_file = os.path.join(output_dir, "modeling_gpt2.py") 15 | with open(target_modeling_file, "r") as file: 16 | content = file.read() 17 | content = content.replace("modalities.conversion.gpt2.configuration_gpt2", ".configuration_gpt2") 18 | with open(target_modeling_file, "w") as file: 19 | file.write(content) 20 | 21 | 22 | def transfer_model_code(output_dir: str): 23 | """Copies the required model code to the output directory and replaces modalities imports. 24 | This allows the converted model to be used without the modalities package via: 25 | >>> from transformers import AutoModelForCausalLM 26 | >>> model = AutoModelForCausalLM.from_pretrained("path/to/converted/model", trust_remote_code=True) 27 | 28 | Args: 29 | output_dir (str): Directory of the converted model. 30 | """ 31 | _copy_model_files(output_dir) 32 | _change_modalities_import_to_relative_import(output_dir) 33 | -------------------------------------------------------------------------------- /src/modalities/dataloader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/dataloader/__init__.py -------------------------------------------------------------------------------- /src/modalities/dataloader/dataloader_factory.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | from torch.utils.data import BatchSampler 4 | from torch.utils.data.dataset import Dataset 5 | 6 | from modalities.dataloader.dataloader import LLMDataLoader 7 | 8 | 9 | class DataloaderFactory: 10 | @staticmethod 11 | def get_dataloader( 12 | dataloader_tag: str, 13 | dataset: Dataset, 14 | batch_sampler: BatchSampler, 15 | collate_fn: Callable, 16 | num_workers: int, 17 | pin_memory: bool, 18 | ) -> LLMDataLoader: 19 | """ 20 | Factory method for the instantiation of LLMDataLoader. 
21 | 22 | Args: 23 | dataloader_tag (str): Tag for the dataloader 24 | dataset (Dataset): Dataset to be used 25 | batch_sampler (BatchSampler): batch sampler for batch-wise sampling from the dataset 26 | collate_fn (Callable): Callable for shaping the batch 27 | num_workers (int): Number of workers for the dataloader 28 | pin_memory (bool): Flag indicating whether to pin memory 29 | Returns: 30 | LLMDataLoader: Instance of LLMDataLoader 31 | """ 32 | dataloader = LLMDataLoader( 33 | dataloader_tag=dataloader_tag, 34 | batch_sampler=batch_sampler, 35 | dataset=dataset, 36 | collate_fn=collate_fn, 37 | num_workers=num_workers, 38 | pin_memory=pin_memory, 39 | ) 40 | 41 | return dataloader 42 | -------------------------------------------------------------------------------- /src/modalities/dataloader/preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/dataloader/preprocessing/__init__.py -------------------------------------------------------------------------------- /src/modalities/exceptions.py: -------------------------------------------------------------------------------- 1 | class DatasetNotFoundError(Exception): 2 | pass 3 | 4 | 5 | class BatchStateError(Exception): 6 | pass 7 | 8 | 9 | class CheckpointingError(Exception): 10 | pass 11 | 12 | 13 | class RunningEnvError(Exception): 14 | pass 15 | 16 | 17 | class TimeRecorderStateError(Exception): 18 | pass 19 | 20 | 21 | class OptimizerError(Exception): 22 | pass 23 | 24 | 25 | class ConfigError(Exception): 26 | pass 27 | 28 | 29 | class ModelStateError(Exception): 30 | pass 31 | -------------------------------------------------------------------------------- /src/modalities/inference/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/inference/__init__.py -------------------------------------------------------------------------------- /src/modalities/inference/inference.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from typing import Optional 4 | 5 | from pydantic import FilePath 6 | 7 | from modalities.config.component_factory import ComponentFactory 8 | from modalities.config.config import ProcessGroupBackendType, load_app_config_dict 9 | from modalities.config.instantiation_models import TextGenerationInstantiationModel 10 | from modalities.inference.text.config import TextInferenceComponentConfig 11 | from modalities.inference.text.inference_component import TextInferenceComponent 12 | from modalities.registry.components import COMPONENTS 13 | from modalities.registry.registry import Registry 14 | from modalities.running_env.cuda_env import CudaEnv 15 | from modalities.running_env.env_utils import is_running_with_torchrun 16 | 17 | 18 | def generate_text(config_path: FilePath, registry: Optional[Registry] = None): 19 | config_dict = load_app_config_dict(config_path) 20 | if registry is None: 21 | registry = Registry(COMPONENTS) 22 | registry.add_entity( 23 | component_key="inference_component", 24 | variant_key="text", 25 | component_type=TextInferenceComponent, 26 | component_config_type=TextInferenceComponentConfig, 27 | ) 28 | component_factory = ComponentFactory(registry=registry) 29 | 30 | if is_running_with_torchrun(): 31 | with 
CudaEnv(process_group_backend=ProcessGroupBackendType.nccl): 32 | components = component_factory.build_components( 33 | config_dict=config_dict, 34 | components_model_type=TextGenerationInstantiationModel, 35 | ) 36 | 37 | else: 38 | components = component_factory.build_components( 39 | config_dict=config_dict, 40 | components_model_type=TextGenerationInstantiationModel, 41 | ) 42 | text_inference_component = components.text_inference_component 43 | 44 | text_inference_component.run() 45 | -------------------------------------------------------------------------------- /src/modalities/inference/text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/inference/text/__init__.py -------------------------------------------------------------------------------- /src/modalities/inference/text/config.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from pydantic import BaseModel, field_validator 4 | 5 | from modalities.config.pydantic_if_types import ( 6 | PydanticPytorchDeviceType, 7 | PydanticPytorchModuleType, 8 | PydanticTokenizerIFType, 9 | ) 10 | from modalities.config.utils import parse_torch_device 11 | 12 | 13 | class TextInferenceComponentConfig(BaseModel): 14 | model: PydanticPytorchModuleType 15 | tokenizer: PydanticTokenizerIFType 16 | prompt_template: str 17 | sequence_length: int 18 | temperature: Optional[float] = 1.0 19 | eod_token: Optional[str] = "" 20 | device: PydanticPytorchDeviceType 21 | 22 | @field_validator("device", mode="before") 23 | def parse_device(cls, device) -> PydanticPytorchDeviceType: 24 | return parse_torch_device(device) 25 | -------------------------------------------------------------------------------- /src/modalities/logging_broker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/logging_broker/__init__.py -------------------------------------------------------------------------------- /src/modalities/logging_broker/message_broker.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from collections import defaultdict 3 | 4 | from modalities.logging_broker.messages import Message, MessageTypes 5 | from modalities.logging_broker.subscriber import MessageSubscriberIF 6 | 7 | 8 | class MessageBrokerIF(ABC): 9 | """Interface for message broker objects.""" 10 | 11 | @abstractmethod 12 | def add_subscriber(self, subscription: MessageTypes, subscriber: MessageSubscriberIF): 13 | raise NotImplementedError 14 | 15 | @abstractmethod 16 | def distribute_message(self, message: Message): 17 | raise NotImplementedError 18 | 19 | 20 | class MessageBroker(MessageBrokerIF): 21 | """The MessageBroker sends notifications to its subscribers.""" 22 | 23 | def __init__(self) -> None: 24 | self.subscriptions: dict[MessageTypes, list[MessageSubscriberIF]] = defaultdict(list) 25 | 26 | def add_subscriber(self, subscription: MessageTypes, subscriber: MessageSubscriberIF): 27 | """Adds a single subscriber.""" 28 | self.subscriptions[subscription].append(subscriber) 29 | 30 | def distribute_message(self, message: Message): 31 | """Distributes message to all subscribers.""" 32 | message_type = message.message_type 33 | for 
subscriber in self.subscriptions[message_type]: 34 | subscriber.consume_message(message=message) 35 | -------------------------------------------------------------------------------- /src/modalities/logging_broker/messages.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from enum import Enum 3 | from typing import Generic, TypeVar 4 | 5 | 6 | class MessageTypes(Enum): 7 | HIGH_LEVEL_PROGRESS_UPDATE = "HIGH_LEVEL_PROGRESS_UPDATE" 8 | BATCH_PROGRESS_UPDATE = "PROGRESS_UPDATE" 9 | ERROR_MESSAGE = "ERROR_MESSAGE" 10 | EVALUATION_RESULT = "EVALUATION_RESULT" 11 | 12 | 13 | T = TypeVar("T") 14 | 15 | 16 | @dataclass 17 | class Message(Generic[T]): 18 | """An object representing a message.""" 19 | 20 | message_type: MessageTypes 21 | payload: T 22 | global_rank: int = 0 23 | local_rank: int = 0 24 | 25 | 26 | class ExperimentStatus(Enum): 27 | TRAIN = "TRAIN" 28 | EVALUATION = "EVALUATION" 29 | 30 | 31 | @dataclass 32 | class ProgressUpdate: 33 | """Object holding the state of the current batch / step computation progress.""" 34 | 35 | num_steps_done: int 36 | # Note: in case of ExperimentStatus.TRAIN, dataset_batch_id=global_train_batch_id 37 | experiment_status: ExperimentStatus 38 | dataloader_tag: str 39 | -------------------------------------------------------------------------------- /src/modalities/logging_broker/publisher.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Generic, TypeVar 3 | 4 | from modalities.logging_broker.message_broker import Message, MessageBroker 5 | from modalities.logging_broker.messages import MessageTypes 6 | 7 | T = TypeVar("T") 8 | 9 | 10 | class MessagePublisherIF(ABC, Generic[T]): 11 | @abstractmethod 12 | def publish_message(self, payload: T, message_type: MessageTypes): 13 | raise NotImplementedError 14 | 15 | 16 | class MessagePublisher(MessagePublisherIF[T]): 17 | """The MessagePublisher sends messages through a message broker.""" 18 | 19 | def __init__( 20 | self, 21 | message_broker: MessageBroker, 22 | global_rank: int, 23 | local_rank: int, 24 | ): 25 | self.message_broker = message_broker 26 | self.global_rank = global_rank 27 | self.local_rank = local_rank 28 | 29 | def publish_message(self, payload: T, message_type: MessageTypes): 30 | """Publish a message through the message broker.""" 31 | message = Message[T]( 32 | message_type=message_type, 33 | global_rank=self.global_rank, 34 | local_rank=self.local_rank, 35 | payload=payload, 36 | ) 37 | self.message_broker.distribute_message(message) 38 | -------------------------------------------------------------------------------- /src/modalities/logging_broker/subscriber.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Generic, TypeVar 3 | 4 | from modalities.logging_broker.messages import Message 5 | 6 | T = TypeVar("T") 7 | 8 | 9 | class MessageSubscriberIF(ABC, Generic[T]): 10 | """Interface for message subscribers.""" 11 | 12 | @abstractmethod 13 | def consume_message(self, message: Message[T]): 14 | raise NotImplementedError 15 | 16 | @abstractmethod 17 | def consume_dict(self, message_dict: dict[str, Any]): 18 | raise NotImplementedError 19 | -------------------------------------------------------------------------------- /src/modalities/logging_broker/subscriber_impl/__init__.py:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/logging_broker/subscriber_impl/__init__.py -------------------------------------------------------------------------------- /src/modalities/models/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/modalities/models/coca/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/models/coca/__init__.py -------------------------------------------------------------------------------- /src/modalities/models/coca/attention_pooling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from modalities.nn.attention import AttentionConfig, AttentionType, MultiHeadAttention 5 | 6 | 7 | class AttentionPooling(nn.Module): 8 | """Attention pooling class.""" 9 | 10 | def __init__(self, n_embd: int, n_head: int, bias: bool, epsilon: float, attention_config: AttentionConfig = None): 11 | """ 12 | Initializes an instance of the AttentionPooling class. 13 | 14 | Args: 15 | n_embd (int): The size of the embeddings. 16 | 17 | n_head (int): The number of attention heads. 18 | bias (bool): Flag indicating whether to include bias in the layer normalization. 19 | epsilon (float): A small value to avoid division by zero in layer normalization. 20 | attention_config (AttentionConfig, optional): The configuration for attention mechanism. Defaults to None. 21 | 22 | Returns: 23 | None 24 | """ 25 | super().__init__() 26 | self.ln_1 = nn.LayerNorm(normalized_shape=n_embd, bias=bias, eps=epsilon) 27 | self.attn = MultiHeadAttention( 28 | n_embd=n_embd, 29 | n_head=n_head, 30 | attention_config=attention_config, 31 | attention_type=AttentionType.CROSS_ATTENTION, 32 | ) 33 | self.ln_2 = nn.LayerNorm(normalized_shape=n_embd, bias=bias, eps=epsilon) 34 | 35 | def forward(self, queries: torch.Tensor, context: torch.Tensor) -> torch.Tensor: 36 | """ 37 | Forward pass of the attention pooling module. 38 | 39 | Args: 40 | queries (torch.Tensor): The input queries tensor. 41 | context (torch.Tensor): The input context tensor. 42 | 43 | Returns: 44 | torch.Tensor: The output tensor. 
45 | """ 46 | x = self.ln_1(context) 47 | x = self.attn(queries, context=x) 48 | x = self.ln_2(x) 49 | return x 50 | -------------------------------------------------------------------------------- /src/modalities/models/components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/models/components/__init__.py -------------------------------------------------------------------------------- /src/modalities/models/gpt2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/models/gpt2/__init__.py -------------------------------------------------------------------------------- /src/modalities/models/gpt2/collator.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import torch 4 | 5 | from modalities.batch import DatasetBatch 6 | 7 | 8 | class CollateFnIF(ABC): 9 | """CollateFnIF class to define a collate function interface.""" 10 | 11 | @abstractmethod 12 | def __call__(self, batch: list[dict[str, torch.Tensor]]) -> DatasetBatch: 13 | """ 14 | Process a batch of data. 15 | 16 | Args: 17 | batch (list[dict[str, torch.Tensor]]): A list of dictionaries containing tensors. 18 | 19 | Returns: 20 | DatasetBatch: The processed batch of data. 21 | 22 | Raises: 23 | NotImplementedError: This abstract method should be implemented in a subclass. 24 | """ 25 | raise NotImplementedError 26 | 27 | 28 | class GPT2LLMCollateFn(CollateFnIF): 29 | """GPT2LLMCollateFn class to define a collate function for GPT2 language model.""" 30 | 31 | def __init__(self, sample_key: str, target_key: str): 32 | """ 33 | Initializes the Collator object. 34 | 35 | Args: 36 | sample_key (str): The key for accessing the sample data. 37 | target_key (str): The key for accessing the target data. 38 | """ 39 | self.sample_key = sample_key 40 | self.target_key = target_key 41 | 42 | def __call__(self, batch: list[dict[str, torch.Tensor]]) -> DatasetBatch: 43 | """ 44 | Process a batch of data. 45 | 46 | Args: 47 | batch (list[dict[str, torch.Tensor]]): A list of dictionaries containing tensors. 48 | 49 | Returns: 50 | DatasetBatch: A processed batch of data where sample and target sequences are created. 51 | 52 | """ 53 | 54 | sample_tensor = torch.stack([torch.tensor(d[self.sample_key]) for d in batch]) 55 | samples = {self.sample_key: sample_tensor[:, :-1]} 56 | targets = {self.target_key: sample_tensor[:, 1:]} 57 | 58 | return DatasetBatch(targets=targets, samples=samples) 59 | -------------------------------------------------------------------------------- /src/modalities/models/gpt2/pretrained_gpt_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import PreTrainedModel 3 | 4 | from modalities.config.config import PretrainedGPTConfig 5 | from modalities.models.gpt2.gpt2_model import GPT2LLM 6 | 7 | 8 | class PretrainedGPTModel(PreTrainedModel): 9 | """Pretrained GPT model class.""" 10 | 11 | config_class = PretrainedGPTConfig 12 | 13 | def __init__(self, config: PretrainedGPTConfig): 14 | """ 15 | Initializes a PretrainedGPTModel object. 16 | 17 | Args: 18 | config (PretrainedGPTConfig): The configuration object for the model. 
19 | 20 | Returns: 21 | None 22 | """ 23 | super().__init__(config) 24 | # TODO offloading the parameters like this is ugly 25 | self.model: GPT2LLM = GPT2LLM(**dict(config.config)) 26 | 27 | def forward(self, tensor): 28 | """ 29 | Forward pass of the pretrained GPT model. 30 | 31 | Args: 32 | tensor (torch.Tensor): The input tensor. 33 | 34 | Returns: 35 | torch.Tensor: The output tensor. 36 | 37 | """ 38 | model_input = {"input_ids": tensor} 39 | model_forward_output: dict[str, torch.Tensor] = self.model(model_input) 40 | return model_forward_output[self.config.config.prediction_key] 41 | 42 | 43 | if __name__ == "__main__": 44 | ... 45 | -------------------------------------------------------------------------------- /src/modalities/models/huggingface/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/modalities/models/huggingface_adapters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/models/huggingface_adapters/__init__.py -------------------------------------------------------------------------------- /src/modalities/models/utils.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | from pydantic import BaseModel 4 | 5 | from modalities.config.component_factory import ComponentFactory 6 | from modalities.config.pydantic_if_types import PydanticPytorchModuleType 7 | from modalities.registry.components import COMPONENTS 8 | from modalities.registry.registry import Registry 9 | 10 | 11 | class ModelTypeEnum(Enum): 12 | """ 13 | Enumeration class representing different types of models. 14 | 15 | Attributes: 16 | MODEL (str): Represents a regular model. 17 | CHECKPOINTED_MODEL (str): Represents a checkpointed model. 18 | """ 19 | 20 | MODEL = "model" 21 | CHECKPOINTED_MODEL = "checkpointed_model" 22 | 23 | 24 | def get_model_from_config(config: dict, model_type: ModelTypeEnum): 25 | """ 26 | Retrieves a model from the given configuration based on the specified model type. 27 | 28 | Args: 29 | config (dict): The configuration dictionary. 30 | model_type (ModelTypeEnum): The type of the model to retrieve. 31 | 32 | Returns: 33 | Any: The model object based on the specified model type. 34 | 35 | Raises: 36 | NotImplementedError: If the model type is not supported. 
37 | """ 38 | registry = Registry(COMPONENTS) 39 | component_factory = ComponentFactory(registry=registry) 40 | 41 | # create the pydantic config for the component factory dynamically based on model_type 42 | if model_type.value == "model": 43 | 44 | class PydanticConfig(BaseModel): 45 | model: PydanticPytorchModuleType 46 | 47 | elif model_type.value == "checkpointed_model": 48 | 49 | class PydanticConfig(BaseModel): 50 | checkpointed_model: PydanticPytorchModuleType 51 | 52 | else: 53 | raise NotImplementedError() 54 | 55 | components = component_factory.build_components(config_dict=config, components_model_type=PydanticConfig) 56 | return getattr(components, model_type.value) 57 | -------------------------------------------------------------------------------- /src/modalities/models/vision_transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/models/vision_transformer/__init__.py -------------------------------------------------------------------------------- /src/modalities/nn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/nn/__init__.py -------------------------------------------------------------------------------- /src/modalities/nn/mlp.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Optional 2 | 3 | from torch import Tensor, nn 4 | 5 | 6 | class MLP(nn.Module): 7 | def __init__( 8 | self, 9 | in_features: int, 10 | hidden_features: Optional[int] = None, 11 | out_features: Optional[int] = None, 12 | bias: bool = True, 13 | dropout: float = 0.0, 14 | act_fn: Callable[[], nn.Module] = nn.GELU, 15 | ): 16 | super().__init__() 17 | out_features = out_features or in_features 18 | hidden_features = hidden_features or 4 * in_features 19 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) 20 | self.act = act_fn() 21 | self.drop1 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() 22 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) 23 | self.drop2 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() 24 | 25 | def forward(self, x: Tensor) -> Tensor: 26 | x = self.fc1(x) 27 | x = self.act(x) 28 | x = self.drop1(x) 29 | x = self.fc2(x) 30 | x = self.drop2(x) 31 | return x 32 | -------------------------------------------------------------------------------- /src/modalities/nn/model_initialization/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/modalities/nn/model_initialization/initialization_if.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import torch.nn as nn 4 | 5 | 6 | class ModelInitializationIF(ABC): 7 | @abstractmethod 8 | def initialize_in_place(self, model: nn.Module): 9 | raise NotImplementedError 10 | -------------------------------------------------------------------------------- /src/modalities/optimizers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/optimizers/__init__.py 
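The MLP class in src/modalities/nn/mlp.py above implements the standard transformer feed-forward block: a linear expansion (hidden width defaulting to 4x the input width), an activation (GELU by default), dropout, and a linear projection back to the output width (defaulting to the input width). A minimal usage sketch, assuming only the class as defined above; the widths and the dropout value are illustrative, not prescribed by the module:

    import torch

    from modalities.nn.mlp import MLP

    # hidden_features defaults to 4 * in_features; out_features defaults to in_features.
    mlp = MLP(in_features=768, dropout=0.1)
    x = torch.randn(2, 16, 768)  # (batch, sequence, embedding)
    out = mlp(x)
    assert out.shape == x.shape  # the block preserves the input shape end-to-end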
-------------------------------------------------------------------------------- /src/modalities/optimizers/lr_schedulers.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from typing import Optional 3 | 4 | from torch.optim import Optimizer 5 | from torch.optim.lr_scheduler import LRScheduler 6 | 7 | 8 | class DummyLRScheduler(LRScheduler): 9 | def __init__(self, optimizer: Optimizer, last_epoch: Optional[int] = -1): 10 | super().__init__(optimizer, last_epoch) 11 | 12 | def get_lr(self) -> list[float]: 13 | if not self._get_lr_called_within_step: # type error expected due to internal pytorch implementation 14 | warnings.warn( 15 | "To get the last learning rate computed by the scheduler, " "please use `get_last_lr()`.", UserWarning 16 | ) 17 | 18 | return [group["lr"] for group in self.optimizer.param_groups] 19 | 20 | def _get_closed_form_lr(self) -> list[float]: 21 | return self.base_lrs 22 | -------------------------------------------------------------------------------- /src/modalities/preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/preprocessing/__init__.py -------------------------------------------------------------------------------- /src/modalities/registry/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/registry/__init__.py -------------------------------------------------------------------------------- /src/modalities/running_env/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/running_env/__init__.py -------------------------------------------------------------------------------- /src/modalities/running_env/cuda_env.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any 3 | 4 | import torch 5 | import torch.distributed as dist 6 | 7 | from modalities.config.config import ProcessGroupBackendType 8 | 9 | 10 | class CudaEnv: 11 | """Context manager to set the CUDA environment for distributed training.""" 12 | 13 | def __init__( 14 | self, 15 | process_group_backend: ProcessGroupBackendType, 16 | ) -> None: 17 | """Initializes the CudaEnv context manager with the process group backend. 18 | 19 | Args: 20 | process_group_backend (ProcessGroupBackendType): Process group backend to be used for distributed training. 21 | """ 22 | self.process_group_backend = process_group_backend 23 | 24 | def __enter__(self) -> "CudaEnv": 25 | """Sets the CUDA environment for distributed training. 26 | 27 | Returns: 28 | CudaEnv: Instance of the CudaEnv context manager. 29 | """ 30 | dist.init_process_group(self.process_group_backend.value) 31 | local_rank = int(os.getenv("LOCAL_RANK", "-1")) 32 | if local_rank == -1: 33 | raise ValueError("LOCAL_RANK environment variable is not set. Please set it before using CudaEnv.") 34 | torch.cuda.set_device(local_rank) 35 | return self 36 | 37 | def __exit__(self, type: Any, value: Any, traceback: Any): 38 | """Exits the CUDA environment for distributed training by destroying the process group. 
39 | 40 | Args: 41 | type (Any): The exception type, if an exception was raised; otherwise None. 42 | value (Any): The exception instance, if an exception was raised; otherwise None. 43 | traceback (Any): The traceback, if an exception was raised; otherwise None. 44 | """ 45 | # TODO and NOTE: 46 | # when we call barrier here and one of the ranks fails, we get stuck here. 47 | # In the future, we should probably add a timeout here and handle the case when one of the ranks fails. 48 | # dist.barrier() 49 | dist.destroy_process_group() 50 | -------------------------------------------------------------------------------- /src/modalities/running_env/env_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import torch.cuda.nccl as nccl 5 | import torch.distributed as dist 6 | from pydantic import BaseModel 7 | 8 | # TODO find a solution for github actions 9 | # to install this as a dependency 10 | # from pkg_resources import packaging 11 | from torch.distributed.fsdp import MixedPrecision 12 | 13 | from modalities.config.lookup_enum import LookupEnum 14 | 15 | 16 | def is_running_with_torchrun(): 17 | # Check for one of the environment variables set by torchrun 18 | return "LOCAL_RANK" in os.environ 19 | 20 | 21 | def has_bfloat_support(): 22 | return ( 23 | torch.version.cuda 24 | and torch.cuda.is_available() 25 | and torch.cuda.is_bf16_supported() 26 | # TODO find a solution for github actions 27 | # to install this as a dependency 28 | # and packaging.version.parse(torch.version.cuda).release >= (11, 0) 29 | and dist.is_nccl_available() 30 | and nccl.version() >= (2, 10) 31 | ) 32 | 33 | 34 | # requires grad scaler in main loop 35 | fpSixteen = MixedPrecision( 36 | param_dtype=torch.float16, 37 | # Gradient communication precision. 38 | reduce_dtype=torch.float16, 39 | # Buffer precision. 40 | buffer_dtype=torch.float16, 41 | ) 42 | 43 | bfSixteen = MixedPrecision( 44 | param_dtype=torch.bfloat16, 45 | # Gradient communication precision. 46 | reduce_dtype=torch.bfloat16, 47 | # Buffer precision.
48 | buffer_dtype=torch.bfloat16, 49 | ) 50 | 51 | bfSixteen_working = MixedPrecision( 52 | param_dtype=torch.float32, 53 | reduce_dtype=torch.bfloat16, 54 | buffer_dtype=torch.bfloat16, 55 | ) 56 | 57 | megatron_strategy = MixedPrecision( 58 | param_dtype=torch.bfloat16, 59 | reduce_dtype=torch.float32, 60 | # buffer_dtype=torch.bfloat16, 61 | ) 62 | 63 | fpThirtytwo = MixedPrecision( 64 | param_dtype=torch.float32, 65 | reduce_dtype=torch.float32, 66 | buffer_dtype=torch.float32, 67 | ) 68 | 69 | no_mixed_precision = None 70 | 71 | 72 | class MixedPrecisionSettings(LookupEnum): 73 | FP_16 = fpSixteen 74 | BF_16 = bfSixteen 75 | BF_16_WORKING = bfSixteen_working 76 | FP_32 = fpThirtytwo 77 | MIXED_PRECISION_MEGATRON = megatron_strategy 78 | NO_MIXED_PRECISION = no_mixed_precision 79 | 80 | 81 | class PyTorchDtypes(LookupEnum): 82 | FP_16 = torch.float16 83 | FP_32 = torch.float32 84 | BF_16 = torch.bfloat16 85 | 86 | 87 | class FSDP2MixedPrecisionSettings(BaseModel): 88 | param_dtype: PyTorchDtypes 89 | reduce_dtype: PyTorchDtypes 90 | -------------------------------------------------------------------------------- /src/modalities/running_env/fsdp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/running_env/fsdp/__init__.py -------------------------------------------------------------------------------- /src/modalities/running_env/fsdp/fsdp_auto_wrapper.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import logging 3 | from abc import ABC, abstractmethod 4 | from typing import Callable 5 | 6 | import torch.nn as nn 7 | from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy 8 | 9 | from modalities.config.lookup_enum import LookupEnum 10 | from modalities.util import get_module_class_from_name, print_rank_0 11 | 12 | 13 | class FSDPAutoWrapFactoryIF(ABC): 14 | @abstractmethod 15 | def get_auto_wrap_policy(self) -> Callable: 16 | raise NotImplementedError 17 | 18 | 19 | class FSDPTransformerAutoWrapPolicyFactory(FSDPAutoWrapFactoryIF): 20 | def __init__(self, model: nn.Module, block_names: list[str]) -> None: 21 | # TODO it's problematic that we store the model in-memory here. Might get too large in RAM... 22 | self.model = model 23 | self.block_names = block_names 24 | 25 | @staticmethod 26 | def _get_fsdp_blocks_from_block_names(model: nn.Module, block_names: list[str]) -> list[nn.Module]: 27 | fsdp_block_types = [] 28 | for cls_block_name in block_names: 29 | # TODO FullyShardedDataParallelPlugin from Accelerate uses string matching to find the correct 30 | # block class. In the long-term we should implement this ourselves in a more robust fashion.
31 | block_type = get_module_class_from_name(model, cls_block_name) 32 | 33 | if block_type is None: 34 | raise ValueError(f"Could not find block with name {cls_block_name} in model") 35 | fsdp_block_types.append(block_type) 36 | return fsdp_block_types 37 | 38 | def get_auto_wrap_policy(self) -> Callable: 39 | transformer_layer_cls = self._get_fsdp_blocks_from_block_names(model=self.model, block_names=self.block_names) 40 | logging.info(f"Wrapped layer classes: {transformer_layer_cls}\n") 41 | print_rank_0(f"\nWrapped layer classes: {transformer_layer_cls}\n") 42 | 43 | if len(transformer_layer_cls) == 0: 44 | raise ValueError("No FSDP blocks found in model") 45 | 46 | auto_wrapper_policy = functools.partial( 47 | transformer_auto_wrap_policy, 48 | transformer_layer_cls={ 49 | *transformer_layer_cls, 50 | }, 51 | ) 52 | return auto_wrapper_policy 53 | 54 | 55 | class FSDPAutoWrapFactoryTypes(LookupEnum): 56 | FSDPTransformerAutoWrapPolicyFactory = FSDPTransformerAutoWrapPolicyFactory 57 | -------------------------------------------------------------------------------- /src/modalities/running_env/fsdp/reducer.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | import torch 4 | import torch.distributed as dist 5 | 6 | 7 | class Reducer: 8 | @staticmethod 9 | def reduce( 10 | tensor: torch.Tensor, 11 | operation: dist.ReduceOp.RedOpType, 12 | post_processing_fun: Callable[[torch.Tensor], torch.Tensor] = None, 13 | ): 14 | dist.all_reduce(tensor, op=operation) 15 | if post_processing_fun is not None: 16 | tensor = post_processing_fun(tensor) 17 | return tensor 18 | -------------------------------------------------------------------------------- /src/modalities/tokenization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/tokenization/__init__.py -------------------------------------------------------------------------------- /src/modalities/training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/training/__init__.py -------------------------------------------------------------------------------- /src/modalities/training/activation_checkpointing.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import torch 4 | from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( 5 | CheckpointImpl, 6 | apply_activation_checkpointing, 7 | checkpoint_wrapper, 8 | ) 9 | from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP 10 | 11 | from modalities.util import get_module_class_from_name 12 | 13 | 14 | def is_module_to_apply_activation_checkpointing( 15 | submodule: torch.nn.Module, activation_checkpointing_modules: list[type] 16 | ) -> bool: 17 | return isinstance(submodule, tuple(activation_checkpointing_modules)) 18 | 19 | 20 | def apply_activation_checkpointing_inplace(model: torch.nn.Module, activation_checkpointing_modules: list[str]): 21 | activation_checkpointing_module_types = [ 22 | get_module_class_from_name(model, m) for m in activation_checkpointing_modules 23 | ] 24 | if not isinstance(model, FSDP): 25 | raise ValueError("activation checkpointing can only be applied to FSDP 
wrapped models!") 26 | non_reentrant_wrapper = partial(checkpoint_wrapper, checkpoint_impl=CheckpointImpl.NO_REENTRANT, debug=False) 27 | 28 | apply_activation_checkpointing( 29 | model, 30 | checkpoint_wrapper_fn=non_reentrant_wrapper, 31 | check_fn=lambda submodule: is_module_to_apply_activation_checkpointing( 32 | submodule, activation_checkpointing_module_types 33 | ), 34 | ) 35 | -------------------------------------------------------------------------------- /src/modalities/training/gradient_clipping/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/training/gradient_clipping/__init__.py -------------------------------------------------------------------------------- /src/modalities/training/gradient_clipping/fsdp_gradient_clipper_config.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated 2 | 3 | from pydantic import BaseModel, Field 4 | 5 | from modalities.config.pydantic_if_types import PydanticPytorchModuleType 6 | from modalities.training.gradient_clipping.fsdp_gradient_clipper import GradientClippingMode 7 | 8 | 9 | class FSDPGradientClipperConfig(BaseModel): 10 | """ 11 | Configuration class for FSDP gradient clipper. 12 | 13 | Args: 14 | max_norm (float): The maximum norm value for gradient clipping. 15 | norm_type (GradientClippingMode): The type of gradient clipping to be applied. 16 | wrapped_model (PydanticPytorchModuleType): The wrapped PyTorch model. 17 | 18 | Attributes: 19 | max_norm (float): The maximum norm value for gradient clipping. 20 | norm_type (GradientClippingMode): The type of gradient clipping to be applied. 21 | wrapped_model (PydanticPytorchModuleType): The wrapped PyTorch model. 22 | """ 23 | 24 | max_norm: Annotated[float, Field(strict=True, gt=0)] 25 | norm_type: GradientClippingMode 26 | wrapped_model: PydanticPytorchModuleType 27 | 28 | 29 | class FSDPDummyGradientClipperConfig(BaseModel): 30 | """ 31 | Configuration class for FSDP dummy gradient clipper. 32 | 33 | Args: 34 | wrapped_model (PydanticPytorchModuleType): The wrapped PyTorch model. 35 | norm_type (GradientClippingMode): The type of gradient clipping to be applied. 36 | 37 | Attributes: 38 | wrapped_model (PydanticPytorchModuleType): The wrapped PyTorch model. 39 | norm_type (GradientClippingMode): The type of gradient clipping to be applied. 40 | """ 41 | 42 | wrapped_model: PydanticPytorchModuleType 43 | norm_type: GradientClippingMode 44 | 45 | 46 | class DummyGradientClipperConfig(BaseModel): 47 | """ 48 | Configuration class for dummy gradient clipper. 49 | 50 | This class is a placeholder and does not have any specific functionality. 51 | 52 | Attributes: 53 | None 54 | 55 | Methods: 56 | None 57 | """ 58 | 59 | pass 60 | -------------------------------------------------------------------------------- /src/modalities/training/gradient_clipping/gradient_clipper.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import torch 4 | 5 | 6 | class GradientClipperIF(ABC): 7 | """The GradientClipper interface that defines the methods for clipping gradients.""" 8 | 9 | @abstractmethod 10 | def clip_gradients(self) -> torch.Tensor: 11 | """ 12 | Clip the gradients of the model. 13 | 14 | Returns: 15 | torch.Tensor: The clipped gradients. 
16 | """ 17 | raise NotImplementedError 18 | -------------------------------------------------------------------------------- /src/modalities/training/training_progress.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Optional 3 | 4 | 5 | @dataclass 6 | class TrainingProgress: 7 | """ 8 | Dataclass to store the training progress. 9 | 10 | Attributes: 11 | 12 | num_seen_steps_current_run (int): Number of seen steps in the current run. 13 | num_seen_tokens_current_run (int): Number of seen tokens in the current run. 14 | num_target_steps (int): Target number of steps. 15 | num_target_tokens (int): Target number of tokens. 16 | num_seen_steps_previous_run (Optional[int]): Number of seen steps in the previous run. 17 | num_seen_tokens_previous_run (Optional[int]): Number of seen tokens in the previous run. 18 | """ 19 | 20 | num_seen_steps_current_run: int 21 | num_seen_tokens_current_run: int 22 | num_target_steps: int 23 | num_target_tokens: int 24 | num_seen_steps_previous_run: Optional[int] = 0 25 | num_seen_tokens_previous_run: Optional[int] = 0 26 | 27 | @property 28 | def num_seen_steps_total(self) -> int: 29 | return self.num_seen_steps_current_run + self.num_seen_steps_previous_run 30 | 31 | @property 32 | def num_seen_tokens_total(self) -> int: 33 | return self.num_seen_tokens_current_run + self.num_seen_tokens_previous_run 34 | -------------------------------------------------------------------------------- /src/modalities/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/src/modalities/utils/__init__.py -------------------------------------------------------------------------------- /src/modalities/utils/logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def get_logger(name: str = "main") -> logging.Logger: 5 | logger = logging.getLogger(name) 6 | if not logger.handlers: 7 | logger.setLevel(logging.DEBUG) 8 | handler = logging.StreamHandler() 9 | handler.setFormatter(logging.Formatter("%(name)s - %(levelname)s - %(message)s")) 10 | logger.addHandler(handler) 11 | return logger 12 | -------------------------------------------------------------------------------- /src/modalities/utils/seeding.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | 3 | 4 | def calculate_hashed_seed(input_data: list[str], max_seed: int = 2**32 - 1) -> int: 5 | # Calculate a seed from a list of strings 6 | # The seed is a number between 0 and max_seed (exclusive) 7 | def _hash_string(input_data: str) -> str: 8 | hash_object = hashlib.sha256(input_data.encode("utf-8")) 9 | hash_hex = hash_object.hexdigest() 10 | return hash_hex 11 | 12 | # even though this becomes an extremely large integer value, 13 | # we don't get overflows as Python can represent integers of arbitrary size 14 | # https://docs.python.org/3/library/exceptions.html#OverflowError 15 | hash_strings = [_hash_string(x) for x in input_data] 16 | 17 | hash_sum = sum([int(x, 16) for x in hash_strings]) 18 | 19 | seed = hash_sum % max_seed # Ensure the seed fits within the max_seed range 20 | 21 | return seed 22 | -------------------------------------------------------------------------------- /src/modalities/utils/typing.py:
-------------------------------------------------------------------------------- 1 | from torch.distributed.fsdp import FSDPModule as FSDP2 2 | from torch.distributed.fsdp import FullyShardedDataParallel as FSDP1 3 | 4 | FSDPX = FSDP1 | FSDP2 5 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/__init__.py -------------------------------------------------------------------------------- /tests/checkpointing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/checkpointing/__init__.py -------------------------------------------------------------------------------- /tests/checkpointing/configs_for_testing/gpt2_config_test.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | component_key: model 3 | variant_key: gpt2 4 | config: 5 | sample_key: input_ids 6 | poe_type: NOPE 7 | sequence_length: 256 8 | prediction_key: logits 9 | vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency 10 | n_layer: 12 11 | n_head_q: 12 12 | n_head_kv: 12 13 | ffn_hidden: 2048 14 | n_embd: 768 15 | dropout: 0.0 16 | bias: true # True: bias in Linears, like GPT-2. False: a bit better and faster 17 | attention_config: 18 | qkv_transforms: 19 | - type_hint: RotaryTransform 20 | config: 21 | n_embd: ${model.config.n_embd} 22 | n_head: ${model.config.n_head_q} #it has to be head_q here 23 | seq_length_dim: -2 24 | base_freq: 10000 25 | attention_implementation: manual 26 | activation_type: gelu 27 | attention_norm_config: 28 | norm_type: rms_norm 29 | config: 30 | ndim: ${model.config.n_embd} 31 | bias: true 32 | epsilon: 1e-5 33 | ffn_norm_config: 34 | norm_type: rms_norm 35 | config: 36 | ndim: ${model.config.n_embd} 37 | bias: true 38 | epsilon: 1e-5 39 | lm_head_norm_config: 40 | norm_type: rms_norm 41 | config: 42 | ndim: ${model.config.n_embd} 43 | bias: true 44 | epsilon: 1e-5 45 | use_weight_tying: true 46 | 47 | checkpointed_model: 48 | component_key: model 49 | variant_key: fsdp1_checkpointed 50 | config: 51 | checkpoint_loading: 52 | component_key: checkpoint_loading 53 | variant_key: torch 54 | config: 55 | device: 0 56 | precision: BF16 57 | model: 58 | instance_key: model 59 | pass_type: BY_REFERENCE 60 | checkpoint_path: null -------------------------------------------------------------------------------- /tests/checkpointing/gpt2_config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | component_key: model 3 | variant_key: model_initialized 4 | config: 5 | model: 6 | instance_key: model_raw 7 | pass_type: BY_REFERENCE 8 | model_initializer: 9 | component_key: model_initialization 10 | variant_key: composed 11 | config: 12 | model_type: gpt2 13 | weight_init_type: scaled 14 | mean: 0.0 15 | std: 0.02 16 | num_layers: ${model_raw.config.n_layer} 17 | 18 | model_raw: 19 | component_key: model 20 | variant_key: gpt2 21 | config: 22 | sample_key: "input_ids" # TODO reference this 23 | poe_type: NOPE 24 | prediction_key: "logits" # TODO reference this 25 | sequence_length: 256 # TODO reference this (same as sequence length) 26 | vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to 
nearest multiple of 64 for efficiency 27 | n_layer: 2 28 | n_head_q: 4 29 | n_head_kv: 4 30 | ffn_hidden: 128 31 | n_embd: 128 32 | dropout: 0.0 33 | bias: true # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster 34 | attention_config: 35 | qkv_transforms: 36 | - type_hint: RotaryTransform 37 | config: 38 | n_embd: ${model_raw.config.n_embd} 39 | n_head: ${model_raw.config.n_head_q} #it has to be head_q here 40 | seq_length_dim: -2 41 | base_freq: 10000 42 | attention_implementation: manual 43 | activation_type: gelu 44 | attention_norm_config: 45 | norm_type: rms_norm 46 | config: 47 | ndim: ${model_raw.config.n_embd} 48 | bias: true 49 | epsilon: 1e-5 50 | ffn_norm_config: 51 | norm_type: rms_norm 52 | config: 53 | ndim: ${model_raw.config.n_embd} 54 | bias: true 55 | epsilon: 1e-5 56 | lm_head_norm_config: 57 | norm_type: rms_norm 58 | config: 59 | ndim: ${model_raw.config.n_embd} 60 | bias: true 61 | epsilon: 1e-5 62 | use_weight_tying: true 63 | -------------------------------------------------------------------------------- /tests/checkpointing/pytorch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/checkpointing/pytorch/__init__.py -------------------------------------------------------------------------------- /tests/checkpointing/pytorch/test_torch_checkpoint_loading.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn as nn 4 | 5 | from modalities.checkpointing.torch.torch_checkpoint_loading import TorchCheckpointLoading 6 | from modalities.config.config import PrecisionEnum 7 | 8 | 9 | class DummyModel(nn.Module): 10 | def __init__(self): 11 | super().__init__() 12 | self._weights = nn.Linear(2, 3) 13 | 14 | def forward(self, inputs: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: 15 | output = self._weights(**inputs) 16 | return {"output": output} 17 | 18 | 19 | @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="This test requires 1 GPU.") 20 | def test_load_model_checkpoint(tmp_path): 21 | # After storing the state_dict on disc, the model state does not 22 | # contain any information about the device or precision 23 | tmp_file_path = tmp_path / "model_state.pth" 24 | 25 | # model that we checkpoint 26 | model_1 = DummyModel().to(dtype=PrecisionEnum.BF16.value) 27 | 28 | # models that we load the checkpoint into 29 | model_2 = DummyModel().to(dtype=PrecisionEnum.FP16.value) 30 | model_3 = DummyModel().to(dtype=PrecisionEnum.FP16.value) 31 | 32 | # perform checkpointing 33 | model_state = model_1.state_dict() 34 | torch.save(model_state, tmp_file_path) 35 | 36 | # load the model checkpoint with different settings 37 | gpu_device = torch.device("cuda:0") 38 | loaded_model_1: DummyModel = TorchCheckpointLoading( 39 | device=gpu_device, precision=PrecisionEnum.FP32 40 | ).load_model_checkpoint(model_2, tmp_file_path) 41 | 42 | assert torch.equal(model_1._weights.weight.to(gpu_device), loaded_model_1._weights.weight) 43 | assert torch.equal(model_1._weights.bias.to(gpu_device), loaded_model_1._weights.bias) 44 | 45 | # since we provided the precision, the model will be loaded with the specified precision 46 | # even if the state dict contains a different precision. 
47 | assert loaded_model_1._weights.weight.dtype == torch.float32 48 | assert loaded_model_1._weights.weight.device == gpu_device 49 | 50 | # if we don't specify the precision, the model will be loaded with the precision of the state dict. 51 | # In this case, BF16 is used as defined for model_1. 52 | loaded_model_2: DummyModel = TorchCheckpointLoading(device=gpu_device).load_model_checkpoint(model_3, tmp_file_path) 53 | assert loaded_model_2._weights.weight.dtype == torch.bfloat16 54 | -------------------------------------------------------------------------------- /tests/checkpointing/test_checkpoint_execution_functions.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | import torch.nn as nn 5 | from torch.distributed.fsdp import FullyShardedDataParallel as FSDP 6 | 7 | from modalities.checkpointing.fsdp.fsdp_checkpoint_saving import FSDP1CheckpointSaving 8 | from modalities.training.training_progress import TrainingProgress 9 | 10 | 11 | @pytest.mark.skip 12 | def dummy_method(module: nn.Module, flag: bool) -> FSDP: 13 | raise NotImplementedError 14 | 15 | 16 | @pytest.mark.skip 17 | def is_empty_directory(folder_path: str) -> bool: 18 | path = Path(folder_path) 19 | return not any(path.iterdir()) 20 | 21 | 22 | CONTENT = "model" 23 | 24 | 25 | def test_get_paths_to_delete(tmp_path): # pytest temp path 26 | checkpointing = FSDP1CheckpointSaving( 27 | checkpoint_path=tmp_path, 28 | experiment_id=str(1), 29 | global_rank=0, 30 | ) 31 | training_progress = TrainingProgress( 32 | num_seen_tokens_current_run=5, num_seen_steps_current_run=10, num_target_tokens=40, num_target_steps=20 33 | ) 34 | 35 | files_paths_to_delete = checkpointing._get_paths_to_delete(training_progress=training_progress) 36 | assert len(files_paths_to_delete) == 2 37 | 38 | 39 | def test_delete_checkpoint(tmpdir): 40 | experiment_id = "2022-05-07__14-31-22" 41 | training_progress = TrainingProgress( 42 | num_seen_tokens_current_run=5, num_seen_steps_current_run=10, num_target_tokens=40, num_target_steps=20 43 | ) 44 | directory = Path(tmpdir) 45 | 46 | (directory / experiment_id).mkdir(exist_ok=True) 47 | optimizer_file_name = ( 48 | f"eid_{experiment_id}-optimizer-seen_steps_{training_progress.num_seen_steps_total}" 49 | f"-seen_tokens_{training_progress.num_seen_tokens_total}" 50 | f"-target_steps_{training_progress.num_target_steps}" 51 | f"-target_tokens_{training_progress.num_target_tokens}.bin" 52 | ) 53 | optimizer_path = directory / experiment_id / optimizer_file_name 54 | optimizer_path.write_text(CONTENT) 55 | 56 | model_file_name = ( 57 | f"eid_{experiment_id}-model-seen_steps_{training_progress.num_seen_steps_total}" 58 | f"-seen_tokens_{training_progress.num_seen_tokens_total}" 59 | f"-target_steps_{training_progress.num_target_steps}" 60 | f"-target_tokens_{training_progress.num_target_tokens}.bin" 61 | ) 62 | model_path = directory / experiment_id / model_file_name 63 | model_path.write_text(CONTENT) 64 | 65 | checkpoint_saving = FSDP1CheckpointSaving( 66 | checkpoint_path=directory, 67 | experiment_id=experiment_id, 68 | global_rank=0, 69 | ) 70 | checkpoint_saving._delete_checkpoint(training_progress=training_progress) 71 | assert is_empty_directory((directory / experiment_id).__str__()) 72 | -------------------------------------------------------------------------------- /tests/checkpointing/test_checkpoint_strategies.py: -------------------------------------------------------------------------------- 1 | import
pytest 2 | 3 | from modalities.checkpointing.checkpoint_saving_strategies import SaveKMostRecentCheckpointsStrategy 4 | from modalities.training.training_progress import TrainingProgress 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "k, saved_instances, checkpoints_to_delete, save_current", 9 | [ 10 | # k value is 2. A new checkpoint is created and the oldest one (here: TrainingProgress(1, 1, 20, 20)) is deleted. 11 | (2, [TrainingProgress(2, 2, 20, 20), TrainingProgress(1, 1, 20, 20)], [TrainingProgress(1, 1, 20, 20)], True), 12 | # k value is 0. No deletion of checkpoints. 13 | (0, [], [], False), 14 | # k value is 2, but there is currently only one checkpoint. Hence, no deletion. 15 | (2, [TrainingProgress(1, 1, 20, 20)], [], True), 16 | # k value is -1, therefore we want to keep all checkpoints without any deletion 17 | ( 18 | -1, 19 | [TrainingProgress(3, 3, 20, 20), TrainingProgress(2, 2, 20, 20), TrainingProgress(1, 1, 20, 20)], 20 | [], 21 | True, 22 | ), 23 | ], 24 | ) 25 | def test_checkpoint_strategy_k( 26 | k: int, saved_instances: list[TrainingProgress], checkpoints_to_delete: list[TrainingProgress], save_current: bool 27 | ) -> None: 28 | num_seen_steps_current_run = 10 29 | training_progress = TrainingProgress( 30 | num_seen_steps_current_run=num_seen_steps_current_run, 31 | num_seen_tokens_current_run=10, 32 | num_target_steps=20, 33 | num_target_tokens=40, 34 | ) 35 | checkpoint_strategy = SaveKMostRecentCheckpointsStrategy(k=k) 36 | checkpoint_strategy.saved_step_checkpoints = saved_instances 37 | checkpoint_instruction = checkpoint_strategy.get_checkpoint_instruction(training_progress=training_progress) 38 | 39 | assert checkpoint_instruction.checkpoints_to_delete == checkpoints_to_delete 40 | assert checkpoint_instruction.save_current == save_current 41 | 42 | # make sure that modifying the training progress externally does not affect saved_step_checkpoints 43 | if k != 0 and save_current: 44 | training_progress.num_seen_steps_current_run = 100 45 | assert checkpoint_strategy.saved_step_checkpoints[0].num_seen_steps_current_run == num_seen_steps_current_run 46 | -------------------------------------------------------------------------------- /tests/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/config/__init__.py -------------------------------------------------------------------------------- /tests/config/components.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class Component_V_W_X_IF: 5 | def print(self) -> None: 6 | print("ComponentIF") 7 | 8 | 9 | # Dependencies 10 | 11 | 12 | class ComponentV(Component_V_W_X_IF): 13 | def __init__(self, val_v: str) -> None: 14 | self.val_v = val_v 15 | 16 | 17 | class ComponentW(Component_V_W_X_IF): 18 | def __init__(self, val_w: str) -> None: 19 | self.val_w = val_w 20 | 21 | 22 | # Components 23 | 24 | 25 | class ComponentX(Component_V_W_X_IF): 26 | def __init__(self, val_x: str, single_dependency: Component_V_W_X_IF) -> None: 27 | self.val_x = val_x 28 | self.single_dependency = single_dependency 29 | 30 | 31 | class ComponentY: 32 | def __init__(self, val_y: str, multi_dependency: list[Component_V_W_X_IF]) -> None: 33 | self.val_y = val_y 34 | self.multi_dependency = multi_dependency 35 | 36 | 37 | class ComponentZ: 38 | def __init__(self, val_z: str) -> None: 39 | self.val_z = val_z 40 | 41 | 42 | class ComponentTypes(Enum): 43 |
COMP_V = ComponentV 44 | COMP_W = ComponentW 45 | COMP_X = ComponentX 46 | COMP_Y = ComponentY 47 | COMP_Z = ComponentZ 48 | -------------------------------------------------------------------------------- /tests/config/configs.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated 2 | 3 | from pydantic import BaseModel 4 | 5 | from modalities.config.pydantic_if_types import PydanticThirdPartyTypeIF 6 | from tests.config.components import Component_V_W_X_IF 7 | 8 | PydanticComponent_V_W_X_IF_Type = Annotated[Component_V_W_X_IF, PydanticThirdPartyTypeIF(Component_V_W_X_IF)] 9 | 10 | 11 | class CompVConfig(BaseModel): 12 | val_v: str 13 | 14 | 15 | class CompWConfig(BaseModel): 16 | val_w: str 17 | 18 | 19 | class CompXConfig(BaseModel): 20 | val_x: str 21 | single_dependency: PydanticComponent_V_W_X_IF_Type 22 | 23 | 24 | class CompYConfig(BaseModel): 25 | val_y: str 26 | multi_dependency: list[PydanticComponent_V_W_X_IF_Type] 27 | 28 | 29 | class CompZConfig(BaseModel): 30 | val_z: str 31 | -------------------------------------------------------------------------------- /tests/config/custom_components.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from enum import Enum 3 | from typing import Literal 4 | 5 | from pydantic import BaseModel, validator 6 | 7 | 8 | class CustomComponent1: 9 | def __init__(self, val_1: str) -> None: 10 | self.val_1 = val_1 11 | 12 | 13 | class CustomComponentTypes(Enum): 14 | CUSTOM_COMP_1 = CustomComponent1 15 | 16 | 17 | class CustomCompConfigABC(BaseModel, ABC): 18 | # TODO make this a string and then implement the mapping 19 | # to the class outside of the basemodel (i.e. in the factory) 20 | type_hint: Enum 21 | 22 | @validator("type_hint", pre=True, allow_reuse=True, check_fields=False) 23 | def _string_to_enum(cls, key: str): 24 | if isinstance(key, str): 25 | try: 26 | key = CustomComponentTypes[key] 27 | except KeyError as e: 28 | raise ValueError(f"{key} is not a valid ComponentType") from e 29 | return key 30 | return key 31 | 32 | 33 | class CustomComp1Config(CustomCompConfigABC): 34 | type_hint: Literal[CustomComponentTypes.CUSTOM_COMP_1] 35 | val_1: str 36 | -------------------------------------------------------------------------------- /tests/config/test_configs/config_backward_reference.yaml: -------------------------------------------------------------------------------- 1 | comp_x_1: 2 | component_key: COMP_X 3 | variant_key: default 4 | config: 5 | val_x: "some other value X" 6 | single_dependency: 7 | component_key: COMP_W 8 | variant_key: default 9 | config: 10 | val_w: "some other value w" 11 | 12 | comp_y_1: 13 | component_key: COMP_Y 14 | variant_key: default 15 | config: 16 | val_y: "some other value y" 17 | multi_dependency: 18 | - component_key: COMP_W 19 | variant_key: default 20 | config: 21 | val_w: "some other value w" 22 | - component_key: COMP_V 23 | variant_key: default 24 | config: 25 | val_v: "some other value v" 26 | - instance_key: comp_x_1 27 | pass_type: BY_REFERENCE 28 | 29 | -------------------------------------------------------------------------------- /tests/config/test_configs/config_forward_reference.yaml: -------------------------------------------------------------------------------- 1 | comp_y_1: 2 | component_key: COMP_Y 3 | variant_key: default 4 | config: 5 | val_y: "some other value y" 6 | multi_dependency: 7 | - component_key: COMP_W 8 | variant_key: default 9 | config: 10 | 
val_w: "some other value w" 11 | - component_key: COMP_V 12 | variant_key: default 13 | config: 14 | val_v: "some other value v" 15 | - instance_key: comp_x_1 16 | pass_type: BY_REFERENCE 17 | 18 | comp_x_1: 19 | component_key: COMP_X 20 | variant_key: default 21 | config: 22 | val_x: "some other value X" 23 | single_dependency: 24 | component_key: COMP_W 25 | variant_key: default 26 | config: 27 | val_w: "some other value w" -------------------------------------------------------------------------------- /tests/config/test_configs/config_hierarchical_list_component.yaml: -------------------------------------------------------------------------------- 1 | 2 | comp_y_1: 3 | component_key: COMP_Y 4 | variant_key: default 5 | config: 6 | val_y: "some other value y" 7 | multi_dependency: 8 | - component_key: COMP_W 9 | variant_key: default 10 | config: 11 | val_w: "some other value w" 12 | - component_key: COMP_V 13 | variant_key: default 14 | config: 15 | val_v: "some other value v" -------------------------------------------------------------------------------- /tests/config/test_configs/config_multiple_top_level_components_with_references.yaml: -------------------------------------------------------------------------------- 1 | # we want to test that comp_x_1->val_x is not referencing 2 | # top-level component val_x 3 | val_x: 4 | component_key: COMP_X 5 | variant_key: default 6 | config: 7 | val_x: "val_x -> config -> val_x" 8 | single_dependency: 9 | component_key: COMP_W 10 | variant_key: default 11 | config: 12 | val_w: "val_w_123" 13 | 14 | single_dependency: 15 | component_key: COMP_W 16 | variant_key: default 17 | config: 18 | val_w: "single_dependency -> config -> val_w" 19 | 20 | comp_x_1: 21 | component_key: COMP_X 22 | variant_key: default 23 | config: 24 | val_x: "comp_x_1 -> config -> val_x" 25 | single_dependency: 26 | component_key: COMP_W 27 | variant_key: default 28 | config: 29 | val_w: "val_w_123" 30 | 31 | 32 | # we want to check that comp_x_2 and comp_x_3 are pointing 33 | # to the same instance of comp_w_1 and 34 | # comp_x_1 is not pointing to comp_w_1 35 | comp_x_2: 36 | component_key: COMP_X 37 | variant_key: default 38 | config: 39 | val_x: "comp_x_2 -> config -> val_x" 40 | single_dependency: 41 | instance_key: comp_w_1 42 | pass_type: BY_REFERENCE 43 | 44 | comp_w_1: 45 | component_key: COMP_W 46 | variant_key: default 47 | config: 48 | val_w: "comp_w_1 -> comp_w" 49 | 50 | comp_x_3: 51 | component_key: COMP_X 52 | variant_key: default 53 | config: 54 | val_x: "comp_x_3 -> config -> val_x" 55 | single_dependency: 56 | instance_key: comp_w_1 57 | pass_type: BY_REFERENCE 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /tests/config/test_configs/config_non_existing_reference.yaml: -------------------------------------------------------------------------------- 1 | comp_y_1: 2 | component_key: COMP_Y 3 | variant_key: default 4 | config: 5 | val_y: "some other value y" 6 | multi_dependency: 7 | - component_key: COMP_W 8 | variant_key: default 9 | config: 10 | val_w: "some other value w" 11 | - component_key: COMP_V 12 | variant_key: default 13 | config: 14 | val_v: "some other value v" 15 | - instance_key: comp_x_1 16 | pass_type: BY_REFERENCE 17 | 18 | -------------------------------------------------------------------------------- /tests/config/test_configs/config_single_component.yaml: -------------------------------------------------------------------------------- 1 | custom_comp_1: 2 | component_key: COMP_V 3 | 
variant_key: default 4 | config: 5 | val_v: "some value v" -------------------------------------------------------------------------------- /tests/conversion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/conversion/__init__.py -------------------------------------------------------------------------------- /tests/conversion/gpt2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/conversion/gpt2/__init__.py -------------------------------------------------------------------------------- /tests/conversion/gpt2/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from pathlib import Path 4 | 5 | import pytest 6 | import torch 7 | 8 | from modalities.config.config import load_app_config_dict 9 | from modalities.models.gpt2.gpt2_model import GPT2LLM 10 | from modalities.models.utils import ModelTypeEnum, get_model_from_config 11 | from tests.conftest import _ROOT_DIR 12 | 13 | 14 | @pytest.fixture 15 | def gpt2_config_path(tmpdir_factory: pytest.TempdirFactory, initialized_model: GPT2LLM, config_file_path: str) -> str: 16 | tmp_path = tmpdir_factory.mktemp("gpt2_model") 17 | new_config_filename = tmp_path / "gpt2_config_test.yaml" 18 | model_path = tmp_path / "model.pth" 19 | shutil.copy(config_file_path, new_config_filename) 20 | torch.save(initialized_model.state_dict(), model_path) 21 | with open(new_config_filename, "r") as file: 22 | content = file.read() 23 | content = content.replace("checkpoint_path: null", f"checkpoint_path: {model_path}") 24 | with open(new_config_filename, "w") as file: 25 | file.write(content) 26 | return str(new_config_filename) 27 | 28 | 29 | @pytest.fixture() 30 | def initialized_model(set_env, modalities_config_dict: dict) -> GPT2LLM: 31 | model = get_model_from_config(config=modalities_config_dict, model_type=ModelTypeEnum.MODEL) 32 | assert isinstance(model, GPT2LLM) 33 | return model 34 | 35 | 36 | @pytest.fixture() 37 | def set_env(): 38 | os.environ["LOCAL_RANK"] = "0" 39 | os.environ["RANK"] = "0" 40 | os.environ["WORLD_SIZE"] = "1" 41 | 42 | 43 | @pytest.fixture() 44 | def modalities_config_dict(config_file_path: Path) -> dict: 45 | return load_app_config_dict(config_file_path=config_file_path) 46 | 47 | 48 | @pytest.fixture() 49 | def config_file_path(config_file_name: str) -> Path: 50 | config_file_path = _ROOT_DIR / Path("tests/conversion/test_configs/" + config_file_name) 51 | return config_file_path 52 | 53 | 54 | @pytest.fixture(params=["gpt2_config_test.yaml"]) 55 | def config_file_name(request) -> str: 56 | return request.param 57 | -------------------------------------------------------------------------------- /tests/conversion/gpt2/helper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from modalities.conversion.gpt2.modeling_gpt2 import GPT2DecoderLayer, GPT2ForCausalLM 5 | from modalities.models.gpt2.gpt2_model import GPT2LLM, GPT2Block 6 | 7 | 8 | def check_same_weight_model(converted_model: GPT2ForCausalLM, modalities_model: GPT2LLM): 9 | converted_model.to(device=modalities_model.transformer.h[0].attn.q_attn.weight.device) 10 | assert 
torch.equal(converted_model.model.embed_tokens.weight, modalities_model.transformer.wte.weight) 11 | for i, (llama_layer, modalities_layer) in enumerate( 12 | zip(converted_model.model.layers, modalities_model.transformer.h) 13 | ): 14 | check_same_weight_attention(llama_layer, modalities_layer) 15 | check_same_weight_mlp(llama_layer, modalities_layer) 16 | check_same_weight_layer_norms(llama_layer, modalities_layer) 17 | check_same_weight_base_modules(converted_model.lm_head, modalities_model.transformer.lm_head) 18 | check_same_weight_base_modules(converted_model.model.norm, modalities_model.transformer.lm_head_norm) 19 | 20 | 21 | def check_same_weight_attention(llama_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): 22 | check_same_weight_base_modules(llama_layer.self_attn.q_proj, modalities_layer.attn.q_attn) 23 | check_same_weight_base_modules(llama_layer.self_attn.k_proj, modalities_layer.attn.k_attn) 24 | check_same_weight_base_modules(llama_layer.self_attn.v_proj, modalities_layer.attn.v_attn) 25 | check_same_weight_base_modules(llama_layer.self_attn.o_proj, modalities_layer.attn.c_proj) 26 | 27 | 28 | def check_same_weight_mlp(llama_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): 29 | check_same_weight_base_modules(llama_layer.mlp.down_proj, modalities_layer.mlp.W_2) 30 | check_same_weight_base_modules(llama_layer.mlp.gate_proj, modalities_layer.mlp.W) 31 | check_same_weight_base_modules(llama_layer.mlp.up_proj, modalities_layer.mlp.V) 32 | 33 | 34 | def check_same_weight_layer_norms(llama_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): 35 | check_same_weight_base_modules(llama_layer.input_layernorm, modalities_layer.attention_norm) 36 | check_same_weight_base_modules(llama_layer.post_attention_layernorm, modalities_layer.ffn_norm) 37 | 38 | 39 | def check_same_weight_base_modules(l1: nn.Linear | nn.LayerNorm, l2: nn.Linear | nn.LayerNorm): 40 | assert torch.equal(l1.weight, l2.weight) 41 | assert (l1.bias is None and l2.bias is None) or torch.equal(l1.bias, l2.bias) 42 | -------------------------------------------------------------------------------- /tests/conversion/gpt2/test_conversion_code.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from modalities.conversion.gpt2.conversion_code import transfer_model_code 4 | 5 | 6 | def test_modeling_gpt2_gets_transferred_with_model_files(tmp_path: Path): 7 | modeling_gpt2_path = tmp_path / "modeling_gpt2.py" 8 | assert not modeling_gpt2_path.exists() 9 | transfer_model_code(str(tmp_path)) 10 | assert modeling_gpt2_path.exists() 11 | 12 | 13 | def test_configuration_gpt2_gets_transferred_with_model_files(tmp_path: Path): 14 | configuration_gpt2_path = tmp_path / "configuration_gpt2.py" 15 | assert not configuration_gpt2_path.exists() 16 | transfer_model_code(str(tmp_path)) 17 | assert configuration_gpt2_path.exists() 18 | 19 | 20 | def test_transferred_modeling_gpt2_does_not_import_from_modalities(tmp_path: Path): 21 | transfer_model_code(str(tmp_path)) 22 | with open(tmp_path / "modeling_gpt2.py") as f: 23 | text = f.read() 24 | assert "from modalities" not in text 25 | assert "import modalities" not in text 26 | 27 | 28 | def test_transferred_configuration_gpt2_does_not_import_from_modalities(tmp_path: Path): 29 | transfer_model_code(str(tmp_path)) 30 | with open(tmp_path / "configuration_gpt2.py") as f: 31 | text = f.read() 32 | assert "from modalities" not in text 33 | assert "import modalities" not in text 34 | 
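[Editorial note, not a file from the repository: the conversion helpers exercised by the tests above and below compose into a single export flow, which the convert_gpt2 entry point (see test_convert_gpt2.py further down) wraps. The sketch below is a minimal, hypothetical usage example: the config path and output directory are placeholder values, and apart from save_pretrained (the standard Hugging Face export method) it only uses functions with the signatures shown in the surrounding test files.]

from pathlib import Path

from modalities.config.config import load_app_config_dict
from modalities.conversion.gpt2.conversion_code import transfer_model_code
from modalities.conversion.gpt2.conversion_model import check_converted_model, convert_model_checkpoint

config_path = "configs/gpt2_config.yaml"  # hypothetical config path
output_dir = Path("converted_gpt2")  # hypothetical output directory

modalities_config = load_app_config_dict(config_file_path=config_path)
# convert the checkpoint; returns the Hugging Face model and the original modalities model
hf_model, modalities_model = convert_model_checkpoint(modalities_config)
# sanity check: both models should produce the same logits on random token ids
vocab_size = modalities_config["model"]["config"]["vocab_size"]
check_converted_model(hf_model=hf_model, modalities_model=modalities_model, num_testruns=1, vocab_size=vocab_size)
# export the weights, then place the standalone modeling/configuration code next to them
hf_model.save_pretrained(output_dir)
transfer_model_code(str(output_dir))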
-------------------------------------------------------------------------------- /tests/conversion/gpt2/test_conversion_model.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn as nn 4 | 5 | from modalities.config.config import load_app_config_dict 6 | from modalities.conversion.gpt2.conversion_model import ( 7 | _copy_weights_base_modules, 8 | check_converted_model, 9 | convert_model_checkpoint, 10 | ) 11 | from tests.conversion.gpt2.helper import check_same_weight_base_modules, check_same_weight_model 12 | 13 | 14 | def test_convert_model_can_generate(gpt2_config_path: str): 15 | modalities_config = load_app_config_dict(gpt2_config_path) 16 | hf_model, _ = convert_model_checkpoint(modalities_config) 17 | assert hf_model.can_generate() 18 | 19 | 20 | def test_convert_model_checkpoint_does_not_change_weights(gpt2_config_path: str): 21 | modalities_config = load_app_config_dict(gpt2_config_path) 22 | hf_model, modalities_model = convert_model_checkpoint(modalities_config) 23 | check_same_weight_model(hf_model, modalities_model) 24 | 25 | 26 | def test_convert_model_checkpoint_produces_same_logits_as_original(gpt2_config_path: str): 27 | modalities_config = load_app_config_dict(gpt2_config_path) 28 | hf_model, modalities_model = convert_model_checkpoint(modalities_config) 29 | vocab_size = modalities_config["model_raw" if "model_raw" in modalities_config else "model"]["config"]["vocab_size"] 30 | check_converted_model(hf_model, modalities_model, num_testruns=1, vocab_size=vocab_size) 31 | 32 | 33 | def test_copying_base_modules_weights_yields_identical_modules(): 34 | m1 = nn.Linear(10, 10, bias=True) 35 | m2 = nn.Linear(10, 10, bias=True) 36 | m2.weight.data = torch.randn(10, 10) 37 | m2.bias.data = torch.randn(10) 38 | 39 | _copy_weights_base_modules(m1, m2) 40 | 41 | check_same_weight_base_modules(m1, m2) 42 | 43 | 44 | def test_copying_base_modules_works_when_bias_is_false(): 45 | m1 = nn.Linear(10, 10, bias=False) 46 | m2 = nn.Linear(10, 10, bias=False) 47 | m2.weight.data = torch.randn(10, 10) 48 | 49 | _copy_weights_base_modules(m1, m2) 50 | 51 | check_same_weight_base_modules(m1, m2) 52 | 53 | 54 | def test_copying_base_modules_fails_if_bias_settings_mismatch(): 55 | m1 = nn.Linear(10, 10, bias=False) 56 | m2 = nn.Linear(10, 10, bias=True) 57 | m2.weight.data = torch.randn(10, 10) 58 | m2.bias.data = torch.randn(10) 59 | 60 | with pytest.raises(AttributeError): 61 | _copy_weights_base_modules(m1, m2) 62 | -------------------------------------------------------------------------------- /tests/conversion/gpt2/test_conversion_tokenizer.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | from transformers import LlamaTokenizer 5 | 6 | from modalities.conversion.gpt2.conversion_tokenizer import convert_tokenizer 7 | from modalities.tokenization.tokenizer_wrapper import PreTrainedSPTokenizer 8 | 9 | 10 | def test_converted_tokenizer_produces_same_tokens_as_original( 11 | converted_tokenizer: LlamaTokenizer, sp_tokenizer: PreTrainedSPTokenizer, text: str 12 | ): 13 | converted_token_ids = converted_tokenizer(text) 14 | sp_token_ids = sp_tokenizer.tokenize(text) 15 | assert converted_token_ids["input_ids"] == sp_token_ids, "Converted token IDs do not match original token IDs." 
16 | 17 | 18 | def test_converted_tokenizer_detokenizes_same_as_original( 19 | converted_tokenizer: LlamaTokenizer, sp_tokenizer: PreTrainedSPTokenizer, token_ids: list[int] 20 | ): 21 | converted_tokens = converted_tokenizer.decode(token_ids) 22 | sp_tokens = sp_tokenizer.decode(token_ids) 23 | assert converted_tokens == sp_tokens, "Decoded tokens do not match between converted and original tokenizers." 24 | 25 | 26 | @pytest.fixture 27 | def converted_tokenizer(tmp_path: Path, tokenizer_model_file: str) -> LlamaTokenizer: 28 | convert_tokenizer(tokenizer_model_path=tokenizer_model_file, output_dir=tmp_path) 29 | return LlamaTokenizer.from_pretrained(tmp_path) 30 | 31 | 32 | @pytest.fixture 33 | def sp_tokenizer(tokenizer_model_file: str) -> PreTrainedSPTokenizer: 34 | return PreTrainedSPTokenizer(tokenizer_model_file=tokenizer_model_file) 35 | 36 | 37 | @pytest.fixture 38 | def tokenizer_model_file() -> str: 39 | return "data/tokenizer/sentencepiece_dclm/en_32k_tokenizer.model" 40 | 41 | 42 | @pytest.fixture( 43 | params=[ 44 | "", 45 | "Hello,\n my dog is cute", 46 | "the secret phrase is ossifrage", 47 | ] 48 | ) 49 | def text(request: pytest.FixtureRequest) -> str: 50 | return request.param 51 | 52 | 53 | @pytest.fixture( 54 | params=[ 55 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 56 | [0, 20527, 1, 20527, 2, 20527, 3, 20527, 4, 20527, 5, 20527, 6, 20527], 57 | [1, 20527], 58 | ] 59 | ) 60 | def token_ids(request: pytest.FixtureRequest) -> list[int]: 61 | return request.param 62 | -------------------------------------------------------------------------------- /tests/conversion/gpt2/test_convert_gpt2.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | import torch 5 | from transformers import AutoModelForCausalLM, PreTrainedModel 6 | 7 | from modalities.config.config import load_app_config_dict 8 | from modalities.conversion.gpt2.conversion_model import check_converted_model 9 | from modalities.conversion.gpt2.convert_gpt2 import convert_gpt2 10 | from modalities.models.gpt2.gpt2_model import GPT2LLM 11 | from modalities.models.utils import ModelTypeEnum, get_model_from_config 12 | from tests.conversion.gpt2.helper import check_same_weight_model 13 | 14 | 15 | def test_converting_gpt2_does_not_change_weights(converted_model: PreTrainedModel, original_model: GPT2LLM): 16 | check_same_weight_model(converted_model, original_model) 17 | 18 | 19 | def test_converting_gpt2_does_not_change_outputs( 20 | converted_model: PreTrainedModel, original_model: GPT2LLM, vocab_size: int 21 | ): 22 | check_converted_model( 23 | hf_model=converted_model, modalities_model=original_model, num_testruns=1, vocab_size=vocab_size 24 | ) 25 | 26 | 27 | @pytest.fixture 28 | def converted_model(run_convert_gpt2: None, output_dir: Path) -> PreTrainedModel: 29 | return AutoModelForCausalLM.from_pretrained(output_dir, local_files_only=True, trust_remote_code=True).to( 30 | dtype=torch.bfloat16 31 | ) 32 | 33 | 34 | @pytest.fixture 35 | def run_convert_gpt2(gpt2_config_path: str, output_dir: Path): 36 | convert_gpt2(gpt2_config_path, output_dir) 37 | 38 | 39 | @pytest.fixture 40 | def original_model(gpt2_config_path: str) -> GPT2LLM: 41 | modalities_config = load_app_config_dict(gpt2_config_path) 42 | return get_model_from_config(modalities_config, model_type=ModelTypeEnum.CHECKPOINTED_MODEL) 43 | 44 | 45 | @pytest.fixture 46 | def vocab_size(gpt2_config_path: str) -> int: 47 | modalities_config = load_app_config_dict(gpt2_config_path) 48 
| return modalities_config["model_raw" if "model_raw" in modalities_config else "model"]["config"]["vocab_size"] 49 | 50 | 51 | @pytest.fixture 52 | def output_dir(tmp_path: Path) -> Path: 53 | return tmp_path / "output" 54 | -------------------------------------------------------------------------------- /tests/conversion/test_configs/gpt2_config_test.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | component_key: model 3 | variant_key: gpt2 4 | config: 5 | sample_key: input_ids 6 | poe_type: NOPE 7 | sequence_length: 128 8 | prediction_key: logits 9 | vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency 10 | n_layer: 3 11 | n_head_q: 4 12 | n_head_kv: 4 13 | ffn_hidden: 512 14 | n_embd: 256 15 | dropout: 0.0 16 | bias: false # True: bias in Linears, like GPT-2. False: a bit better and faster 17 | attention_config: 18 | qkv_transforms: 19 | - type_hint: RotaryTransform 20 | config: 21 | n_embd: ${model.config.n_embd} 22 | n_head: ${model.config.n_head_q} #it has to be head_q here 23 | seq_length_dim: -2 24 | base_freq: 500000 25 | attention_implementation: pytorch_flash # manual 26 | activation_type: swiglu 27 | attention_norm_config: 28 | norm_type: layer_norm 29 | config: 30 | normalized_shape: ${model.config.n_embd} 31 | eps: 1e-5 32 | bias: true 33 | ffn_norm_config: 34 | norm_type: layer_norm 35 | config: 36 | normalized_shape: ${model.config.n_embd} 37 | eps: 1e-5 38 | bias: true 39 | lm_head_norm_config: 40 | norm_type: layer_norm 41 | config: 42 | normalized_shape: ${model.config.n_embd} 43 | eps: 1e-5 44 | bias: true 45 | use_weight_tying: true 46 | 47 | checkpointed_model: 48 | component_key: model 49 | variant_key: fsdp1_checkpointed 50 | config: 51 | checkpoint_loading: 52 | component_key: checkpoint_loading 53 | variant_key: torch 54 | config: 55 | device: cpu 56 | precision: BF16 57 | model: 58 | instance_key: model 59 | pass_type: BY_REFERENCE 60 | checkpoint_path: null -------------------------------------------------------------------------------- /tests/data/datasets/lorem_ipsum_long.idx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/data/datasets/lorem_ipsum_long.idx -------------------------------------------------------------------------------- /tests/data/datasets/lorem_ipsum_long.pbin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/data/datasets/lorem_ipsum_long.pbin -------------------------------------------------------------------------------- /tests/dataloader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/dataloader/__init__.py -------------------------------------------------------------------------------- /tests/dataloader/distributed/dist_dataloader_config_with_shuffling.yaml: -------------------------------------------------------------------------------- 1 | train_dataset: 2 | component_key: dataset 3 | variant_key: test 4 | config: 5 | num_samples: 8 6 | 7 | train_dataloader: 8 | component_key: data_loader 9 | variant_key: default 10 | config: 11 | num_workers: 2 12 | pin_memory: true 13 | dataloader_tag: train 14 | dataset: 15 | 
instance_key: train_dataset 16 | pass_type: BY_REFERENCE 17 | batch_sampler: 18 | component_key: batch_sampler 19 | variant_key: default 20 | config: 21 | batch_size: 2 22 | drop_last: true 23 | sampler: 24 | component_key: sampler 25 | variant_key: resumable_distributed_sampler 26 | config: 27 | rank: ${cuda_env:RANK} 28 | num_replicas: ${cuda_env:WORLD_SIZE} 29 | shuffle: true 30 | seed: 0 31 | skip_num_global_samples: 0 32 | dataset: 33 | instance_key: train_dataset 34 | pass_type: BY_REFERENCE -------------------------------------------------------------------------------- /tests/dataloader/distributed/dist_dataloader_config_with_shuffling_and_skipped_batches.yaml: -------------------------------------------------------------------------------- 1 | train_dataset: 2 | component_key: dataset 3 | variant_key: test 4 | config: 5 | num_samples: 8 6 | 7 | train_dataloader: 8 | component_key: data_loader 9 | variant_key: default 10 | config: 11 | num_workers: 2 12 | pin_memory: true 13 | dataloader_tag: train 14 | dataset: 15 | instance_key: train_dataset 16 | pass_type: BY_REFERENCE 17 | batch_sampler: 18 | component_key: batch_sampler 19 | variant_key: default 20 | config: 21 | batch_size: 2 22 | drop_last: true 23 | sampler: 24 | component_key: sampler 25 | variant_key: resumable_distributed_sampler 26 | config: 27 | rank: ${cuda_env:RANK} 28 | num_replicas: ${cuda_env:WORLD_SIZE} 29 | shuffle: true 30 | seed: 0 31 | skip_num_global_samples: 4 # num_batches (1) * world_size (2) * local_micro_batch_size (2) 32 | dataset: 33 | instance_key: train_dataset 34 | pass_type: BY_REFERENCE -------------------------------------------------------------------------------- /tests/dataloader/distributed/dist_dataloader_config_without_shuffling.yaml: -------------------------------------------------------------------------------- 1 | train_dataset: 2 | component_key: dataset 3 | variant_key: test 4 | config: 5 | num_samples: 8 6 | 7 | train_dataloader: 8 | component_key: data_loader 9 | variant_key: default 10 | config: 11 | num_workers: 2 12 | pin_memory: true 13 | dataloader_tag: train 14 | dataset: 15 | instance_key: train_dataset 16 | pass_type: BY_REFERENCE 17 | batch_sampler: 18 | component_key: batch_sampler 19 | variant_key: default 20 | config: 21 | batch_size: 2 22 | drop_last: true 23 | sampler: 24 | component_key: sampler 25 | variant_key: resumable_distributed_sampler 26 | config: 27 | rank: ${cuda_env:RANK} 28 | num_replicas: ${cuda_env:WORLD_SIZE} 29 | shuffle: false 30 | skip_num_global_samples: 0 31 | dataset: 32 | instance_key: train_dataset 33 | pass_type: BY_REFERENCE -------------------------------------------------------------------------------- /tests/dataloader/dummy_sequential_dataset.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from torch.utils.data.dataset import Dataset as TorchdataSet 3 | 4 | 5 | class TestDataset(TorchdataSet): 6 | def __init__(self, num_samples: int): 7 | self.samples = list(range(num_samples)) 8 | 9 | def __len__(self) -> int: 10 | return len(self.samples) 11 | 12 | def __getitem__(self, idx: int) -> dict: 13 | return self.samples[idx] 14 | 15 | 16 | class TestDatasetConfig(BaseModel): 17 | num_samples: int 18 | -------------------------------------------------------------------------------- /tests/dataloader/preprocessing/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/dataloader/preprocessing/__init__.py -------------------------------------------------------------------------------- /tests/dataloader/preprocessing/chunking/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/dataloader/preprocessing/chunking/__init__.py -------------------------------------------------------------------------------- /tests/dataloader/preprocessing/tokenization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/dataloader/preprocessing/tokenization/__init__.py -------------------------------------------------------------------------------- /tests/dataloader/samplers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/dataloader/samplers/__init__.py -------------------------------------------------------------------------------- /tests/dataloader/samplers/test_sequential_samplers.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from torch.utils.data import Dataset, SequentialSampler 3 | 4 | 5 | class DummyDataset(Dataset): 6 | def __init__(self, num_samples): 7 | self.data = list(range(num_samples)) 8 | 9 | def __len__(self): 10 | return len(self.data) 11 | 12 | def __getitem__(self, index): 13 | return self.data[index] 14 | 15 | 16 | @pytest.mark.parametrize( 17 | "num_samples, world_size", 18 | [ 19 | (10, 3), 20 | (15, 4), 21 | ], 22 | ) 23 | def test_distributed_setting(num_samples, world_size): 24 | dataset = DummyDataset(num_samples) 25 | samplers = [SequentialSampler(dataset) for _ in range(world_size)] 26 | 27 | expected_indices = list(range(num_samples)) 28 | # Ensures that all ranks receive the exact same samples in the same order 29 | assert all(list(sampler) == expected_indices for sampler in samplers) 30 | -------------------------------------------------------------------------------- /tests/dataloader/test_combined_dataset.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from modalities.dataloader.dataset import CombinedDataset 4 | 5 | 6 | @pytest.fixture 7 | def dummy_dataset_1() -> list[int]: 8 | return list(range(10)) 9 | 10 | 11 | @pytest.fixture 12 | def dummy_dataset_2() -> list[int]: 13 | return list(range(10, 15)) 14 | 15 | 16 | def test_combined_dataset(dummy_dataset_1: list[int], dummy_dataset_2: list[int]): 17 | combined_dataset = CombinedDataset(datasets=[dummy_dataset_1, dummy_dataset_2]) 18 | 19 | # check that length is calculated correctly 20 | assert len(combined_dataset) == 15 21 | 22 | # check that the elements are iterated over in order 23 | assert [i for i in combined_dataset] == list(range(15)) 24 | 25 | # check that we throw an error when trying to access an index that is out of bounds 26 | with pytest.raises(IndexError): 27 | combined_dataset[15] 28 | -------------------------------------------------------------------------------- /tests/dataloader/test_dummy_dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from 
modalities.dataloader.dataset import DummyDataset, DummySampleConfig, DummySampleDataType 4 | 5 | 6 | def test_dummy_dataset(): 7 | dataset = DummyDataset( 8 | num_samples=50, 9 | sample_definition=[ 10 | DummySampleConfig(sample_key="input_ids", sample_shape=(512,), sample_type=DummySampleDataType.INT), 11 | DummySampleConfig(sample_key="images", sample_shape=(3, 224, 224), sample_type=DummySampleDataType.FLOAT), 12 | ], 13 | ) 14 | assert len(dataset) == 50 15 | sample = next(iter(dataset)) 16 | assert "input_ids" in sample 17 | assert sample["input_ids"].shape == (512,) 18 | assert sample["input_ids"].dtype == np.int64 19 | assert "images" in sample 20 | assert sample["images"].shape == (3, 224, 224) 21 | assert sample["images"].dtype == np.float64 22 | -------------------------------------------------------------------------------- /tests/dataloader/yaml_configs/skipped_dataloader.yaml: -------------------------------------------------------------------------------- 1 | # NOTE: the settings are not type-checked in the instantiation model (specified within the test), as the settings are not used in the pydantic model. 2 | # Therefore, we can place arbitrary values in the settings field. 3 | 4 | settings: 5 | referencing_keys: 6 | sample_key: input_ids 7 | target_key: target_ids 8 | training: 9 | local_train_micro_batch_size: 2 10 | global_num_seen_tokens: 2048 11 | sequence_length: 256 12 | cuda_env: 13 | global_rank: 0 14 | world_size: 2 15 | 16 | collate_fn: 17 | component_key: collate_fn 18 | variant_key: gpt_2_llm_collator 19 | config: 20 | sample_key: ${settings.referencing_keys.sample_key} 21 | target_key: ${settings.referencing_keys.target_key} 22 | 23 | train_dataset: 24 | component_key: dataset 25 | variant_key: packed_mem_map_dataset_continuous 26 | config: 27 | raw_data_path: ./data/lorem_ipsum.pbin 28 | sequence_length: ${settings.training.sequence_length} 29 | sample_key: ${settings.referencing_keys.sample_key} 30 | 31 | skip_num_samples: 32 | component_key: number_conversion 33 | variant_key: num_samples_from_num_tokens 34 | config: 35 | num_tokens: ${settings.training.global_num_seen_tokens} 36 | sequence_length: ${settings.training.sequence_length} 37 | 38 | train_dataloader: 39 | component_key: data_loader 40 | variant_key: default 41 | config: 42 | num_workers: 2 43 | pin_memory: true 44 | dataloader_tag: train 45 | dataset: 46 | instance_key: train_dataset 47 | pass_type: BY_REFERENCE 48 | batch_sampler: 49 | component_key: batch_sampler 50 | variant_key: default 51 | config: 52 | batch_size: ${settings.training.local_train_micro_batch_size} 53 | drop_last: true 54 | sampler: 55 | component_key: sampler 56 | variant_key: resumable_distributed_sampler 57 | config: 58 | dataset: 59 | instance_key: train_dataset 60 | pass_type: BY_REFERENCE 61 | rank: ${settings.cuda_env.global_rank} 62 | num_replicas: ${settings.cuda_env.world_size} 63 | shuffle: false 64 | drop_last: true 65 | skip_num_global_samples: ${skip_num_samples} 66 | collate_fn: 67 | instance_key: collate_fn 68 | pass_type: BY_REFERENCE 69 | -------------------------------------------------------------------------------- /tests/end2end_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/end2end_tests/__init__.py -------------------------------------------------------------------------------- /tests/end2end_tests/custom_components.py:
-------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any, Optional 3 | 4 | from pydantic import BaseModel 5 | 6 | from modalities.batch import EvaluationResultBatch 7 | from modalities.config.config import ProcessGroupBackendType 8 | from modalities.logging_broker.messages import Message 9 | from modalities.logging_broker.subscriber import MessageSubscriberIF 10 | from modalities.running_env.cuda_env import CudaEnv 11 | 12 | 13 | class SaveAllResultSubscriber(MessageSubscriberIF[EvaluationResultBatch]): 14 | def __init__(self): 15 | self.message_list: list[Message[EvaluationResultBatch]] = [] 16 | 17 | def consume_message(self, message: Message[EvaluationResultBatch]): 18 | """Consumes a message from a message broker.""" 19 | self.message_list.append(message) 20 | 21 | def consume_dict(self, message_dict: dict[str, Any]): 22 | pass 23 | 24 | 25 | class SaveAllResultSubscriberConfig(BaseModel): 26 | pass 27 | 28 | 29 | class MultiProcessingCudaEnv(CudaEnv): 30 | """Context manager to set the CUDA environment for distributed training.""" 31 | 32 | def __init__( 33 | self, 34 | process_group_backend: ProcessGroupBackendType, 35 | global_rank: int, 36 | local_rank: int, 37 | world_size: int, 38 | rdvz_port: int, 39 | ) -> None: 40 | super().__init__(process_group_backend=process_group_backend) 41 | self.global_rank = global_rank 42 | self.local_rank = local_rank 43 | self.world_size = world_size 44 | self.rdvz_port = rdvz_port 45 | self._original_env: dict[str, Optional[str]] = {} 46 | 47 | def __enter__(self): 48 | # Store original values 49 | for key in ["MASTER_ADDR", "MASTER_PORT", "RANK", "LOCAL_RANK", "WORLD_SIZE"]: 50 | self._original_env[key] = os.environ.get(key) 51 | 52 | # Set new environment variables 53 | os.environ["MASTER_ADDR"] = "localhost" 54 | os.environ["MASTER_PORT"] = str(self.rdvz_port) 55 | os.environ["RANK"] = str(self.global_rank) 56 | os.environ["LOCAL_RANK"] = str(self.local_rank) 57 | os.environ["WORLD_SIZE"] = str(self.world_size) 58 | 59 | # Initialize CUDA environment 60 | super().__enter__() 61 | return self 62 | 63 | def __exit__(self, exc_type, exc_val, exc_tb): 64 | # Restore original environment variables 65 | for key, value in self._original_env.items(): 66 | if value is None: 67 | os.environ.pop(key, None) 68 | else: 69 | os.environ[key] = value 70 | super().__exit__(exc_type, exc_val, exc_tb) 71 | -------------------------------------------------------------------------------- /tests/end2end_tests/lorem_ipsum.pbin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/end2end_tests/lorem_ipsum.pbin -------------------------------------------------------------------------------- /tests/end2end_tests/system_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/end2end_tests/system_tests/__init__.py -------------------------------------------------------------------------------- /tests/end2end_tests/test_shuffle_jsonl_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tempfile 3 | from pathlib import Path 4 | 5 | import pytest 6 | 7 | from modalities.api import FileExistencePolicy, shuffle_jsonl_data 8 | 9 | 10 | @pytest.fixture 11 | def data_rows() -> 
list[dict]: 12 | return [ 13 | {"file_id": "file_0.jsonl", "doc_id": "0"}, 14 | {"file_id": "file_0.jsonl", "doc_id": "1"}, 15 | {"file_id": "file_0.jsonl", "doc_id": "2"}, 16 | {"file_id": "file_0.jsonl", "doc_id": "3"}, 17 | {"file_id": "file_0.jsonl", "doc_id": "4"}, 18 | {"file_id": "file_0.jsonl", "doc_id": "5"}, 19 | {"file_id": "file_0.jsonl", "doc_id": "6"}, 20 | {"file_id": "file_0.jsonl", "doc_id": "7"}, 21 | {"file_id": "file_0.jsonl", "doc_id": "8"}, 22 | ] 23 | 24 | 25 | @pytest.fixture 26 | def input_data_path(data_rows: list[dict], tmp_path) -> Path: 27 | with open(tmp_path / "input.jsonl", "w", encoding="utf-8") as f: 28 | for row in data_rows: 29 | json.dump(row, f) 30 | f.write("\n") 31 | f.flush() 32 | return Path(f.name) 33 | 34 | 35 | @pytest.mark.parametrize( 36 | "output_data_folder_path, file_existence_policy, seed", 37 | [ 38 | (Path(tempfile.mkdtemp()), FileExistencePolicy.ERROR, 42), 39 | ], 40 | ) 41 | def test_shuffle_jsonl_data( 42 | data_rows: list[dict], 43 | input_data_path: Path, 44 | output_data_folder_path: Path, 45 | file_existence_policy: FileExistencePolicy, 46 | seed: int, 47 | ): 48 | output_data_path = output_data_folder_path / "output.jsonl" 49 | shuffle_jsonl_data( 50 | input_data_path=input_data_path, 51 | output_data_path=output_data_path, 52 | file_existence_policy=file_existence_policy, 53 | seed=seed, 54 | ) 55 | 56 | with output_data_path.open("r", encoding="utf-8") as f: 57 | lines = f.readlines() 58 | rows_dict_shuffled = [json.loads(line) for line in lines] 59 | 60 | # Check that the shuffled data contains the same rows as the input data 61 | assert len(data_rows) > 0 62 | assert len(data_rows) == len(rows_dict_shuffled) 63 | assert any([row != row_shuffled for row, row_shuffled in zip(data_rows, rows_dict_shuffled)]) 64 | assert set([json.dumps(d) for d in data_rows]) == set([json.dumps(d) for d in rows_dict_shuffled]) 65 | -------------------------------------------------------------------------------- /tests/end2end_tests/test_shuffle_tokenized_data.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import tempfile 3 | from pathlib import Path 4 | 5 | import pytest 6 | 7 | from modalities.api import FileExistencePolicy, shuffle_tokenized_data 8 | from modalities.dataloader.dataset import PackedMemMapDatasetBase 9 | 10 | 11 | def _calculate_md5(file_path: Path): 12 | hash_md5 = hashlib.md5() 13 | with open(file_path, "rb") as f: 14 | for chunk in iter(lambda: f.read(4096), b""): 15 | hash_md5.update(chunk) 16 | return hash_md5.hexdigest() 17 | 18 | 19 | @pytest.mark.parametrize( 20 | "tokenized_data_file_path, batch_size", 21 | [ 22 | (Path("tests/end2end_tests/lorem_ipsum.pbin"), 2), 23 | ], 24 | ) 25 | def test_shuffle_tokenized_data(tokenized_data_file_path: Path, batch_size: int): 26 | # temporary file 27 | md5sums = [] 28 | seeds = [1, 1, 2] 29 | file_paths = [] 30 | datasets = [] 31 | with tempfile.TemporaryDirectory() as temp_dir: 32 | for i in range(3): 33 | temp_file = Path(temp_dir) / f"shuffled_data_{i}.pbin" 34 | file_paths.append(temp_file) 35 | shuffle_tokenized_data( 36 | tokenized_data_file_path, 37 | output_data_path=temp_file, 38 | batch_size=batch_size, 39 | file_existence_policy=FileExistencePolicy.OVERRIDE, 40 | seed=seeds[i], 41 | ) 42 | md5sums.append(_calculate_md5(temp_file)) 43 | datasets.append(PackedMemMapDatasetBase(raw_data_path=temp_file, sample_key="text", load_index=True)) 44 | 45 | # check that the different seeds lead to different orderings 46 | 
# and that the same seed leads to the same ordering 47 | assert md5sums[0] == md5sums[1] 48 | assert md5sums[0] != md5sums[2] 49 | 50 | assert len(datasets[0]) == len(datasets[1]) == len(datasets[2]) 51 | for i in range(len(datasets[0])): 52 | assert all(datasets[0][i]["text"] == datasets[1][i]["text"]) 53 | 54 | # when we shuffle some lines might end up in the same place 55 | # in this test we make sure that at least one line is at a different place 56 | num_differing_lines = 0 57 | for i in range(len(datasets[0])): 58 | if len(datasets[0][i]["text"]) == len(datasets[2][i]["text"]): 59 | num_differing_lines += int(any(datasets[0][i]["text"] != datasets[2][i]["text"])) 60 | else: 61 | num_differing_lines += 1 62 | assert num_differing_lines > 0 63 | -------------------------------------------------------------------------------- /tests/fsdp2_parallelization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/fsdp2_parallelization/__init__.py -------------------------------------------------------------------------------- /tests/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/models/__init__.py -------------------------------------------------------------------------------- /tests/models/coca/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/models/coca/__init__.py -------------------------------------------------------------------------------- /tests/models/coca/coca_config.yaml: -------------------------------------------------------------------------------- 1 | prediction_key: logits 2 | vision_embd_prediction_key: vision_embeddings 3 | text_embd_prediction_key: text_embeddings 4 | vision_cls_prediction_key: vision_cls 5 | text_cls_prediction_key: text_cls 6 | vision_encoder_config: 7 | sample_key: images 8 | prediction_key: vision_embeddings 9 | img_size: 224 10 | n_classes: Null # Disable vision transformer head 11 | n_layer: 6 12 | attention_config: 13 | attention_engine_type: pytorch_flash_attention 14 | n_head: 8 15 | n_embd: 768 16 | dropout: 0.0 17 | patch_size: 16 18 | patch_stride: 16 19 | n_img_channels: 3 20 | add_cls_token: False 21 | bias: True 22 | text_decoder_config: 23 | sample_key: input_ids 24 | prediction_key: text_embeddings 25 | block_size: 1024 26 | vocab_size: 50304 27 | n_layer_text: 6 28 | n_layer_multimodal_text: 6 29 | attention_config: 30 | attention_engine_type: pytorch_flash_attention 31 | n_head: 12 32 | ffn_hidden: 3072 33 | n_embd: 768 34 | dropout: 0.0 35 | bias: true 36 | activation: swiglu 37 | epsilon: 1e-5 38 | n_pool_head: 8 39 | n_vision_queries: 256 40 | bias_attn_pool: False 41 | epsilon_attn_pool: 1e-5 -------------------------------------------------------------------------------- /tests/models/coca/test_attention_pooling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from modalities.models.coca.attention_pooling import AttentionPooling 4 | 5 | 6 | def test_attention_pooling_forward(): 7 | model = AttentionPooling(n_embd=768, n_head=8, bias=False, epsilon=1e-5) 8 | dummy_input = torch.randn(1, 256, 768) 9 | dummy_queries = torch.randn(1, 257, 
768) 10 | out = model(dummy_queries, dummy_input) 11 | assert out.shape == (1, 257, 768) 12 | -------------------------------------------------------------------------------- /tests/models/components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/models/components/__init__.py -------------------------------------------------------------------------------- /tests/models/components/test_layer_norms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | import torch 4 | import torch.nn as nn 5 | 6 | from modalities.models.components.layer_norms import RMSLayerNorm 7 | 8 | 9 | @pytest.fixture 10 | def rms_layer_norm() -> RMSLayerNorm: 11 | norm = RMSLayerNorm(ndim=3, epsilon=1e-6) 12 | weight_tensor = torch.Tensor([1, 2, 3]) 13 | norm.weight = nn.Parameter(weight_tensor) 14 | norm.bias = nn.Parameter(torch.ones(3)) 15 | return norm 16 | 17 | 18 | def test_rms_layer_norm_forward(rms_layer_norm): 19 | x = torch.Tensor([0.1, 0.2, 0.3]) 20 | output = rms_layer_norm(x) 21 | ref_x = x / np.sqrt((0.1**2 + 0.2**2 + 0.3**2) / 3 + 1e-6) 22 | ref_tensor = ref_x * rms_layer_norm.weight + torch.tensor([1, 1, 1]) 23 | 24 | assert output.shape == x.shape 25 | assert all(output == ref_tensor) 26 | -------------------------------------------------------------------------------- /tests/models/test_hf_adapter.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from modalities.models.huggingface_adapters.hf_adapter import HFModelAdapterConfig 6 | 7 | 8 | @pytest.fixture() 9 | def hf_model_adapter_config() -> HFModelAdapterConfig: 10 | return HFModelAdapterConfig(config={}) 11 | 12 | 13 | def test_convert_posixpath_to_str(hf_model_adapter_config: HFModelAdapterConfig): 14 | test_data_to_be_formatted = { 15 | "key1": Path("test/path/1"), 16 | "key2": [ 17 | {"key211": Path("test/path/211"), "key212": 1}, 18 | {"key221": 1, "key222": Path("test/path/222")}, 19 | ], 20 | "key3": 1, 21 | } 22 | expected_result = { 23 | "key1": "test/path/1", 24 | "key2": [ 25 | {"key211": "test/path/211", "key212": 1}, 26 | {"key221": 1, "key222": "test/path/222"}, 27 | ], 28 | "key3": 1, 29 | } 30 | result = hf_model_adapter_config._convert_posixpath_to_str(test_data_to_be_formatted) 31 | assert result == expected_result 32 | -------------------------------------------------------------------------------- /tests/models/test_model_factory.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn as nn 4 | 5 | from modalities.exceptions import ModelStateError 6 | from modalities.models.model_factory import ModelFactory 7 | 8 | 9 | class AllMetaDeviceModel(nn.Module): 10 | def __init__(self): 11 | super().__init__() 12 | self.linear = nn.Linear(4, 2, device="meta") 13 | self.register_buffer("buffer", torch.empty(1, device="meta")) 14 | 15 | 16 | class AllRealDeviceModel(nn.Module): 17 | def __init__(self): 18 | super().__init__() 19 | self.linear = nn.Linear(4, 2) 20 | self.register_buffer("buffer", torch.empty(1)) 21 | 22 | 23 | class MixedDeviceModel(nn.Module): 24 | def __init__(self): 25 | super().__init__() 26 | self.linear = nn.Linear(4, 2, device="meta") 27 | self.register_buffer("buffer", torch.empty(1)) # Not on meta device 28 | 29 | 30 
| def test_is_model_on_meta_device_true(): 31 | model = AllMetaDeviceModel() 32 | assert ModelFactory._is_model_on_meta_device(model) 33 | 34 | 35 | def test_is_model_on_meta_device_false(): 36 | model = AllRealDeviceModel() 37 | assert not ModelFactory._is_model_on_meta_device(model) 38 | 39 | 40 | def test_is_model_on_meta_device_mixed_raises(): 41 | model = MixedDeviceModel() 42 | with pytest.raises(ModelStateError): 43 | ModelFactory._is_model_on_meta_device(model) 44 | -------------------------------------------------------------------------------- /tests/models/vision_transformer/test_vision_transformer.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | import torch 5 | 6 | from modalities.__main__ import load_app_config_dict 7 | from modalities.models.vision_transformer.vision_transformer_model import VisionTransformer, VisionTransformerConfig 8 | from tests.conftest import _ROOT_DIR 9 | 10 | 11 | def test_vision_transformer(): 12 | # Create model 13 | config_file_path = _ROOT_DIR / Path("tests/models/vision_transformer/vision_transformer_config.yaml") 14 | config_dict = load_app_config_dict(config_file_path=config_file_path) 15 | config = VisionTransformerConfig.model_validate(config_dict) 16 | model = VisionTransformer(**dict(config)) 17 | 18 | # Create dummy inputs 19 | dummy_input_image = torch.randn(1, 3, 224, 224) 20 | dummy_input = dict(images=dummy_input_image) 21 | 22 | # Create optimizer 23 | optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9) 24 | 25 | # Run one training step 26 | optimizer.zero_grad() 27 | out = model(dummy_input) 28 | loss = out["logits"].sum() 29 | loss.backward() 30 | optimizer.step() 31 | 32 | # Test outputs 33 | assert "logits" in out 34 | assert out["logits"].shape == (1, 1000) 35 | 36 | 37 | @pytest.mark.parametrize( 38 | "img_size,patch_size,patch_stride,add_cls_token,target_block_size", 39 | [ 40 | ((224, 224), 16, 16, True, 197), 41 | ((224, 224), 16, 16, False, 196), 42 | ((224, 112), 16, 16, False, 98), 43 | ((480, 480), 16, 16, False, 900), 44 | ((480 + 1, 480 + 1), 16, 16, False, 900), 45 | ((224, 224), 8, 16, True, 197), 46 | ((224, 224), 16, 8, True, 730), 47 | ((224, 224), 8, 8, True, 785), 48 | ], 49 | ) 50 | def test_vision_transformer_block_size(img_size, patch_size, patch_stride, add_cls_token, target_block_size): 51 | block_size = VisionTransformer._calculate_block_size(img_size, patch_size, patch_stride, add_cls_token) 52 | assert block_size == target_block_size 53 | -------------------------------------------------------------------------------- /tests/models/vision_transformer/vision_transformer_config.yaml: -------------------------------------------------------------------------------- 1 | sample_key: images 2 | prediction_key: logits 3 | img_size: 224 4 | n_classes: 1000 5 | n_layer: 6 6 | n_head: 8 7 | n_embd: 768 8 | dropout: 0.0 9 | patch_size: 16 10 | patch_stride: 16 11 | n_img_channels: 3 12 | add_cls_token: True 13 | bias: True 14 | -------------------------------------------------------------------------------- /tests/nn/test_attention.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from modalities.nn.attention import AttentionType, MultiHeadAttention 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "attention_type", [AttentionType.CAUSAL_SELF_ATTENTION, AttentionType.NON_CAUSAL_SELF_ATTENTION] 9 | ) 10 | def 
test_attention_forward(attention_type): 11 | model = MultiHeadAttention(n_embd=64, n_head=8, attention_type=attention_type) 12 | dummy_input = torch.randn(1, 256, 64) 13 | out = model(dummy_input) 14 | assert out.shape == (1, 256, 64) 15 | 16 | 17 | def test_attention_with_cross_attention_forward(): 18 | model = MultiHeadAttention(n_embd=64, n_head=8, attention_type=AttentionType.CROSS_ATTENTION) 19 | dummy_input = torch.randn(1, 256, 64) 20 | dummy_context = torch.randn(1, 16, 64) 21 | out = model(dummy_input, context=dummy_context) 22 | assert out.shape == (1, 256, 64) 23 | -------------------------------------------------------------------------------- /tests/nn/test_mlp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from modalities.models.model import SwiGLU 5 | from modalities.nn.mlp import MLP 6 | 7 | 8 | def test_mlp_forward(): 9 | model = MLP(in_features=64, hidden_features=256) 10 | dummy_input = torch.randn(1, 10, 64) 11 | out = model(dummy_input) 12 | assert out.shape == (1, 10, 64) 13 | 14 | 15 | def test_SwiGLU_forward(): 16 | n_embd = 512 17 | ffn_hidden = 4 * n_embd 18 | bias = True 19 | mlp = SwiGLU(n_embd=n_embd, ffn_hidden=ffn_hidden, bias=bias) 20 | 21 | hidden_dim = 1536 22 | assert SwiGLU._get_hidden_dim(ffn_hidden=ffn_hidden) == hidden_dim 23 | 24 | n_embd = 511 25 | ffn_hidden = 4 * n_embd 26 | assert SwiGLU._get_hidden_dim(ffn_hidden=ffn_hidden) == hidden_dim 27 | 28 | n_embd = 512 29 | 30 | # batch size x sequence length x embedding dim 31 | input_tensor = torch.randn(1, 1, n_embd) 32 | output_tensor = mlp(input_tensor) 33 | assert output_tensor.shape == (1, 1, n_embd) 34 | 35 | W = nn.Linear(in_features=n_embd, out_features=hidden_dim, bias=bias) 36 | V = nn.Linear(in_features=n_embd, out_features=hidden_dim, bias=bias) 37 | W_2 = nn.Linear(in_features=hidden_dim, out_features=n_embd, bias=bias) 38 | silu = nn.SiLU() 39 | mlp.W = W 40 | mlp.V = V 41 | mlp.W_2 = W_2 42 | 43 | output_tensor = mlp(input_tensor) 44 | assert torch.all(output_tensor == W_2(silu(W(input_tensor)) * V(input_tensor))) 45 | -------------------------------------------------------------------------------- /tests/run_all_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ####################### 4 | ### INPUT ARGUMENTS ### 5 | ####################### 6 | if [ -z "$1" ] || [ -z "$2" ] # if one of the two input arguments does not exist 7 | then 8 | echo "Need to specify 2 GPU devices as arguments, e.g. bash run_all_tests.sh 0 1" 9 | exit 10 | fi 11 | if [[ $1 =~ [^0-7] ]] || [[ $2 =~ [^0-7] ]] # if one of the two input arguments contains a character other than 0-7 12 | then 13 | echo "Need to specify integers 0-7 as arguments, e.g. bash run_all_tests.sh 0 1" 14 | exit 15 | fi 16 | 17 | ################# 18 | ### VARIABLES ### 19 | ################# 20 | DEV0=$1 21 | DEV1=$2 22 | 23 | ############# 24 | ### TESTS ### 25 | ############# 26 | 27 | SCRIPT_DIR=$(dirname "$(readlink -f "$0")") 28 | cd "$SCRIPT_DIR/.."
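# Editorial note on the coverage setup below (added for clarity): the single-process
# pytest run writes its own partial coverage data file into .coverage_reports/ via the
# COVERAGE_FILE environment variable, the distributed suite is invoked with --cov and
# presumably contributes further .coverage.part* files, and the partial reports are
# merged at the end with `coverage combine`.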
29 | 30 | 31 | mkdir -p .coverage_reports 32 | rm -f .coverage_reports/* 33 | 34 | COVERAGE_FILE=.coverage_reports/.coverage.part0 python -m pytest tests/ 35 | 36 | sh tests/run_distributed_tests.sh $DEV0 $DEV1 --cov 37 | 38 | # combine test coverage reports 39 | cd .coverage_reports 40 | coverage combine --keep 41 | coverage report 42 | -------------------------------------------------------------------------------- /tests/test_evaluator.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import call 2 | 3 | import torch 4 | 5 | from modalities.batch import DatasetBatch 6 | from modalities.evaluator import Evaluator 7 | 8 | 9 | def test_evaluate_cpu( 10 | monkeypatch, nn_model_mock, loss_mock, llm_data_loader_mock, progress_publisher_mock, set_env_cpu 11 | ): 12 | batch_size = 32 13 | seq_len = 64 14 | num_batches = 4 15 | sample_key = "input_ids" 16 | target_key = "target_ids" 17 | 18 | sample_tensor = torch.randint(size=(batch_size, seq_len), low=1, high=100) 19 | samples = {sample_key: sample_tensor[:, :-1]} 20 | targets = {target_key: sample_tensor[:, 1:]} 21 | 22 | batches = [DatasetBatch(targets=targets, samples=samples) for _ in range(num_batches)] 23 | 24 | llm_data_loader_mock.__iter__ = lambda _: iter(batches) 25 | llm_data_loader_mock.batch_size = batch_size 26 | 27 | evaluator = Evaluator( 28 | progress_publisher=progress_publisher_mock, 29 | evaluation_result_publisher=progress_publisher_mock, 30 | ) 31 | 32 | evaluator.evaluate( 33 | model=nn_model_mock, data_loaders=[llm_data_loader_mock], loss_fun=loss_mock, num_train_steps_done=1 34 | ) 35 | nn_model_mock.assert_has_calls([call(b.samples) for b in batches]) 36 | -------------------------------------------------------------------------------- /tests/test_gym.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import call 2 | 3 | from modalities.gym import Gym 4 | from tests.test_utils import configure_dataloader_mock 5 | 6 | 7 | def test_run_cpu_only( 8 | monkeypatch, 9 | checkpoint_saving_mock, 10 | evaluator_mock, 11 | app_state_mock, 12 | loss_mock, 13 | llm_data_loader_mock, 14 | set_env_cpu, 15 | trainer, 16 | ): 17 | num_batches = 4 18 | num_ranks = 1 19 | 20 | llm_data_loader_mock, batches = configure_dataloader_mock( 21 | batch_size=32, 22 | seq_len=64, 23 | num_batches=num_batches, 24 | sample_key="input_ids", 25 | target_key="target_ids", 26 | llm_data_loader_mock=llm_data_loader_mock, 27 | ) 28 | 29 | gym = Gym(trainer=trainer, evaluator=evaluator_mock, loss_fun=loss_mock, num_ranks=num_ranks) 30 | gym.run( 31 | app_state=app_state_mock, 32 | training_log_interval_in_steps=1, 33 | checkpointing_interval_in_steps=1, 34 | evaluation_interval_in_steps=1, 35 | train_data_loader=llm_data_loader_mock, 36 | evaluation_data_loaders=[], 37 | checkpoint_saving=checkpoint_saving_mock, 38 | ) 39 | app_state_mock.model.assert_has_calls([call(b.samples) for b in batches]) 40 | app_state_mock.optimizer.step.assert_called() 41 | -------------------------------------------------------------------------------- /tests/test_loss_functions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from modalities.batch import InferenceResultBatch 5 | from modalities.loss_functions import NCELoss, nce_loss 6 | 7 | 8 | @pytest.fixture 9 | def dummy_result_batch() -> InferenceResultBatch: 10 | predictions = {"embedding": torch.rand(1024, 512)} 11 | targets = 
{"target": torch.zeros(1024, 512)} 12 | batch_dim = 1024 13 | result_batch = InferenceResultBatch(targets, predictions, batch_dim) 14 | return result_batch 15 | 16 | 17 | # calculating asymmetric NCELoss between a batch of embeddings and itself --> zero 18 | @pytest.mark.parametrize("key", ["embedding"]) 19 | def test_asymm_NCELoss_is_zero(dummy_result_batch, key): 20 | loss_func = NCELoss(prediction_key1=key, prediction_key2=key) 21 | assert loss_func(dummy_result_batch) <= 10e-6 22 | 23 | 24 | # calculating nce_loss for two randomly generated batch of embeddings (manually calculated) 25 | @pytest.mark.parametrize( 26 | "embedding1,embedding2", 27 | [ 28 | ( 29 | torch.Tensor([[0.38, 0.18], [0.36, 0.66], [0.72, 0.09]]), 30 | torch.Tensor([[0.48, 0.01], [0.54, 0.28], [0.08, 0.34]]), 31 | ) 32 | ], 33 | ) 34 | def test_nce_loss_correctness(embedding1, embedding2): 35 | unidirectional_loss = nce_loss(embedding1, embedding2, device="cpu", is_asymmetric=True, temperature=1.0) 36 | bidirectional_loss = nce_loss(embedding1, embedding2, device="cpu", is_asymmetric=False, temperature=1.0) 37 | assert unidirectional_loss == pytest.approx(1.1300, 0.0001) 38 | assert bidirectional_loss == pytest.approx(2.2577, 0.0001) 39 | -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch.cuda 3 | 4 | from modalities.__main__ import Main 5 | from modalities.config.config import ProcessGroupBackendType 6 | from modalities.config.instantiation_models import TrainingComponentsInstantiationModel 7 | from modalities.running_env.cuda_env import CudaEnv 8 | 9 | 10 | @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="This e2e test requires 1 GPU.") 11 | def test_e2e_training_run_wout_ckpt(monkeypatch, dummy_config, dummy_config_path): 12 | # patch in env variables 13 | monkeypatch.setenv("MASTER_ADDR", "localhost") 14 | monkeypatch.setenv("MASTER_PORT", "9948") 15 | 16 | with CudaEnv(process_group_backend=ProcessGroupBackendType.nccl): 17 | main = Main(dummy_config_path) 18 | main.config_dict = dummy_config 19 | components = main.build_components(components_model_type=TrainingComponentsInstantiationModel) 20 | main.run(components) 21 | -------------------------------------------------------------------------------- /tests/test_rotary_qkv_transform.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from modalities.models.gpt2.gpt2_model import RotaryTransform 4 | 5 | 6 | def test_rotary_transform(): 7 | bs = 1 8 | n_heads = 2 9 | embedding_dim = 8 10 | seq_lenght = 2 11 | head_dim = embedding_dim // n_heads 12 | 13 | q = torch.ones(bs, n_heads, seq_lenght, head_dim) + 1 14 | q[:, :, :, head_dim // 2 :] = q[:, :, :, head_dim // 2 :] + 1 15 | k = torch.ones(bs, n_heads, seq_lenght, head_dim) + 2 16 | k[:, :, :, head_dim // 2 :] = k[:, :, :, head_dim // 2 :] + 1 17 | v = torch.ones(bs, n_heads, seq_lenght, head_dim) 18 | 19 | rotary_transform = RotaryTransform(n_embd=embedding_dim, n_head=n_heads) 20 | 21 | q_rot, k_rot, v_rot = rotary_transform(q=q, k=k, v=v) 22 | 23 | assert torch.equal(v, v_rot) 24 | assert v.shape == v_rot.shape 25 | 26 | theta = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim)) 27 | 28 | m = torch.tensor([0, 1]).view(2, 1) 29 | theta_0 = theta[0] 30 | theta_1 = theta[1] 31 | theta = torch.tensor([theta_0, theta_1, theta_0, theta_1]).view(1, 4) 32 | m_theta = m 
* theta 33 | 34 | cos_m_theta = m_theta.cos() 35 | sin_m_theta = m_theta.sin() 36 | 37 | for comp, comp_rot in zip([q, k], [q_rot, k_rot]): 38 | assert not torch.equal(comp, comp_rot) 39 | assert comp.shape == comp_rot.shape 40 | comp_h_1, comp_h_2 = comp.chunk(2, dim=-1) 41 | comp_rot_h = torch.cat([-comp_h_2, comp_h_1], dim=-1) 42 | comp_rot_expected = comp * cos_m_theta + comp_rot_h * sin_m_theta 43 | assert torch.equal(comp_rot_expected, comp_rot) 44 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import debugpy 4 | import torch 5 | 6 | from modalities.batch import DatasetBatch 7 | from modalities.util import get_local_number_of_trainable_parameters 8 | 9 | 10 | def add_debugger_to_distributed_test(): 11 | """Add a debugger to a distributed test. 12 | This function should be called at the beginning of the test. 13 | 14 | Within VScode you can use the following configuration to attach the debugger to the test: 15 | 16 | ```json 17 | { 18 | "name": "Test Torch Distributed", 19 | "type": "python", 20 | "request": "launch", 21 | "program": "path/to/torchrun", 22 | "console": "integratedTerminal", 23 | "env": {"CUDA_VISIBLE_DEVICES": "0,1"}, 24 | "args": ["--rdzv-endpoint", "localhost:29833", "--nnodes", "1", 25 | "--nproc_per_node", "2", "path/to/pytest", "tests/some_test.py"], 26 | "justMyCode": false, 27 | }, 28 | ``` 29 | """ 30 | # Get the rank of the process (0 or 1 in this case) 31 | rank = int(os.getenv("RANK")) 32 | 33 | # Use a different port for each process 34 | port = 9875 + rank 35 | debugpy.listen(("0.0.0.0", port)) # Listening on all interfaces to allow debugger to attach 36 | print(f"Rank {rank}: Waiting for debugger to attach on port {port}...") 37 | debugpy.wait_for_client() # Pause here until the debugger attaches 38 | 39 | 40 | def configure_dataloader_mock( 41 | batch_size: int, 42 | seq_len: int, 43 | num_batches: int, 44 | sample_key: str, 45 | target_key: str, 46 | llm_data_loader_mock, 47 | ): 48 | sample_tensor = torch.randint(size=(batch_size, seq_len), low=1, high=100) 49 | samples = {sample_key: sample_tensor[:, :-1]} 50 | targets = {target_key: sample_tensor[:, 1:]} 51 | 52 | batches = [DatasetBatch(targets=targets, samples=samples) for _ in range(num_batches)] 53 | 54 | llm_data_loader_mock.__iter__ = lambda _: iter(batches) 55 | llm_data_loader_mock.batch_size = batch_size 56 | llm_data_loader_mock.fast_forward_batch_id = 0 57 | llm_data_loader_mock.__len__ = lambda _: num_batches 58 | 59 | return llm_data_loader_mock, batches 60 | 61 | 62 | def test_get_local_number_of_trainable_parameters(): 63 | # Create a simple model with trainable parameters 64 | model = torch.nn.Sequential(torch.nn.Linear(10, 5), torch.nn.ReLU(), torch.nn.Linear(5, 2)) 65 | 66 | # Calculate the expected number of trainable parameters 67 | expected_params = 10 * 5 + 5 + 5 * 2 + 2 # weights_1 + bias_1 + weights_2 + bias_2 = 67 68 | 69 | # Call the function and check the result 70 | assert get_local_number_of_trainable_parameters(model) == expected_params 71 | -------------------------------------------------------------------------------- /tests/test_yaml_configs/coca_config_initialization.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | component_key: model 3 | variant_key: model_initialized 4 | config: 5 | model: 6 | instance_key: model_raw 7 | pass_type: BY_REFERENCE 8 | 
model_initializer: 9 | component_key: model_initialization 10 | variant_key: composed 11 | config: 12 | model_type: coca 13 | weight_init_type: WILL_BE_REPLACED 14 | mean: 0.0 15 | std: WILL_BE_REPLACED 16 | 17 | model_raw: 18 | component_key: model 19 | variant_key: coca 20 | config: 21 | prediction_key: logits 22 | vision_embd_prediction_key: vision_embeddings 23 | text_embd_prediction_key: text_embeddings 24 | vision_cls_prediction_key: vision_cls 25 | text_cls_prediction_key: text_cls 26 | vision_encoder_config: 27 | sample_key: images 28 | prediction_key: vision_embeddings 29 | img_size: 224 30 | n_classes: Null # Disable vision transformer head 31 | n_layer: 6 32 | attention_config: 33 | attention_engine_type: default_attention 34 | n_head: 8 35 | n_embd: 768 36 | dropout: 0.0 37 | patch_size: 16 38 | patch_stride: 16 39 | n_img_channels: 3 40 | add_cls_token: False 41 | bias: True 42 | text_decoder_config: 43 | sample_key: input_ids 44 | prediction_key: logits 45 | block_size: 1024 46 | vocab_size: 50304 47 | n_layer_text: 6 48 | n_layer_multimodal_text: 6 49 | attention_config: 50 | attention_engine_type: default_attention 51 | n_head: 12 52 | ffn_hidden: 3072 53 | n_embd: 768 54 | dropout: 0.0 55 | bias: true 56 | activation: swiglu 57 | epsilon: 1e-5 58 | n_pool_head: 8 59 | n_vision_queries: 256 60 | bias_attn_pool: False 61 | epsilon_attn_pool: 1e-5 -------------------------------------------------------------------------------- /tests/test_yaml_configs/gpt2_config_initialization.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | component_key: model 3 | variant_key: model_initialized 4 | config: 5 | model: 6 | instance_key: model_raw 7 | pass_type: BY_REFERENCE 8 | model_initializer: 9 | component_key: model_initialization 10 | variant_key: composed 11 | config: 12 | model_type: gpt2 13 | weight_init_type: WILL_BE_REPLACED 14 | mean: 0.0 15 | std: WILL_BE_REPLACED 16 | hidden_dim: ${model_raw.config.n_embd} 17 | num_layers: ${model_raw.config.n_layer} 18 | 19 | model_raw: 20 | component_key: model 21 | variant_key: gpt2 22 | config: 23 | sample_key: "input_ids" 24 | poe_type: ABSOLUTE 25 | prediction_key: "logits" 26 | sequence_length: 2048 27 | vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency 28 | n_layer: 12 29 | n_head_q: 12 30 | n_head_kv: 12 31 | ffn_hidden: 2048 32 | n_embd: 768 33 | dropout: 0.0 34 | bias: true # True: bias in Linears and LayerNorms, like GPT-2. 
False: a bit better and faster 35 | attention_config: 36 | qkv_transforms: [] 37 | attention_implementation: manual 38 | activation_type: gelu 39 | attention_norm_config: 40 | norm_type: rms_norm 41 | config: 42 | ndim: ${model_raw.config.n_embd} 43 | bias: true 44 | epsilon: 1e-5 45 | ffn_norm_config: 46 | norm_type: rms_norm 47 | config: 48 | ndim: ${model_raw.config.n_embd} 49 | bias: true 50 | epsilon: 1e-5 51 | lm_head_norm_config: 52 | norm_type: rms_norm 53 | config: 54 | ndim: ${model_raw.config.n_embd} 55 | bias: true 56 | epsilon: 1e-5 57 | use_weight_tying: WILL_BE_REPLACED -------------------------------------------------------------------------------- /tests/test_yaml_configs/gpt2_config_initialization_fsdp1.yaml: -------------------------------------------------------------------------------- 1 | tested_model: 2 | component_key: model 3 | variant_key: fsdp1_wrapped 4 | config: 5 | model: 6 | instance_key: initialized_model 7 | pass_type: BY_REFERENCE 8 | sync_module_states: true 9 | mixed_precision_settings: BF_16 10 | sharding_strategy: FULL_SHARD 11 | block_names: [GPT2Block] 12 | 13 | initialized_model: 14 | component_key: model 15 | variant_key: model_initialized 16 | config: 17 | model: 18 | instance_key: model_raw 19 | pass_type: BY_REFERENCE 20 | model_initializer: 21 | component_key: model_initialization 22 | variant_key: composed 23 | config: 24 | model_type: gpt2 25 | weight_init_type: WILL_BE_REPLACED 26 | mean: 0.0 27 | std: WILL_BE_REPLACED 28 | hidden_dim: ${model_raw.config.n_embd} 29 | num_layers: ${model_raw.config.n_layer} 30 | 31 | model_raw: 32 | component_key: model 33 | variant_key: gpt2 34 | config: 35 | sample_key: "input_ids" 36 | poe_type: ABSOLUTE 37 | prediction_key: "logits" 38 | sequence_length: 2048 39 | vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency 40 | n_layer: 12 41 | n_head_q: 12 42 | n_head_kv: 12 43 | ffn_hidden: 2048 44 | n_embd: 768 45 | dropout: 0.0 46 | bias: true # True: bias in Linears and LayerNorms, like GPT-2. 
False: a bit better and faster 47 | attention_config: 48 | qkv_transforms: [] 49 | attention_implementation: manual 50 | activation_type: gelu 51 | attention_norm_config: 52 | norm_type: rms_norm 53 | config: 54 | ndim: ${model_raw.config.n_embd} 55 | bias: true 56 | epsilon: 1e-5 57 | ffn_norm_config: 58 | norm_type: rms_norm 59 | config: 60 | ndim: ${model_raw.config.n_embd} 61 | bias: true 62 | epsilon: 1e-5 63 | lm_head_norm_config: 64 | norm_type: rms_norm 65 | config: 66 | ndim: ${model_raw.config.n_embd} 67 | bias: true 68 | epsilon: 1e-5 69 | use_weight_tying: WILL_BE_REPLACED -------------------------------------------------------------------------------- /tests/test_yaml_configs/gpt2_config_initialization_fsdp2.yaml: -------------------------------------------------------------------------------- 1 | tested_model: 2 | component_key: model 3 | variant_key: model_initialized 4 | config: 5 | model: 6 | instance_key: fsdp_model 7 | pass_type: BY_REFERENCE 8 | model_initializer: 9 | component_key: model_initialization 10 | variant_key: composed 11 | config: 12 | model_type: gpt2 13 | weight_init_type: WILL_BE_REPLACED 14 | mean: 0.0 15 | std: WILL_BE_REPLACED 16 | hidden_dim: ${model_raw.config.n_embd} 17 | num_layers: ${model_raw.config.n_layer} 18 | 19 | fsdp_model: 20 | component_key: model 21 | variant_key: fsdp2_wrapped 22 | config: 23 | model: 24 | instance_key: model_raw 25 | pass_type: BY_REFERENCE 26 | device_mesh: 27 | instance_key: device_mesh 28 | pass_type: BY_REFERENCE 29 | mixed_precision_settings: 30 | param_dtype: BF_16 31 | reduce_dtype: BF_16 32 | block_names: [GPT2Block] 33 | 34 | model_raw: 35 | component_key: model 36 | variant_key: gpt2 37 | config: 38 | sample_key: "input_ids" 39 | poe_type: ABSOLUTE 40 | prediction_key: "logits" 41 | sequence_length: 2048 42 | vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency 43 | n_layer: 12 44 | n_head_q: 12 45 | n_head_kv: 12 46 | ffn_hidden: 2048 47 | n_embd: 768 48 | dropout: 0.0 49 | bias: true # True: bias in Linears and LayerNorms, like GPT-2. 
False: a bit better and faster 50 | attention_config: 51 | qkv_transforms: [] 52 | attention_implementation: manual 53 | activation_type: gelu 54 | attention_norm_config: 55 | norm_type: rms_norm 56 | config: 57 | ndim: ${model_raw.config.n_embd} 58 | bias: true 59 | epsilon: 1e-5 60 | ffn_norm_config: 61 | norm_type: rms_norm 62 | config: 63 | ndim: ${model_raw.config.n_embd} 64 | bias: true 65 | epsilon: 1e-5 66 | lm_head_norm_config: 67 | norm_type: rms_norm 68 | config: 69 | ndim: ${model_raw.config.n_embd} 70 | bias: true 71 | epsilon: 1e-5 72 | use_weight_tying: WILL_BE_REPLACED 73 | use_meta_device: false 74 | 75 | device_mesh: 76 | component_key: device_mesh 77 | variant_key: default 78 | config: 79 | device_type: cuda 80 | data_parallel_replicate_degree: 1 81 | data_parallel_shard_degree: ${cuda_env:WORLD_SIZE} 82 | world_size: ${cuda_env:WORLD_SIZE} -------------------------------------------------------------------------------- /tests/test_yaml_configs/gpt2_config_mfu_fsdp1.yaml: -------------------------------------------------------------------------------- 1 | test_model: 2 | component_key: model 3 | variant_key: fsdp1_wrapped 4 | config: 5 | model: 6 | instance_key: model_initialized 7 | pass_type: BY_REFERENCE 8 | sync_module_states: true 9 | mixed_precision_settings: BF_16 10 | sharding_strategy: FULL_SHARD 11 | block_names: [GPT2Block] 12 | 13 | model_initialized: 14 | component_key: model 15 | variant_key: model_initialized 16 | config: 17 | model: 18 | instance_key: model_raw 19 | pass_type: BY_REFERENCE 20 | model_initializer: 21 | component_key: model_initialization 22 | variant_key: composed 23 | config: 24 | model_type: gpt2 25 | weight_init_type: scaled 26 | mean: 0.0 27 | std: 0.02 28 | num_layers: ${model_raw.config.n_layer} 29 | 30 | model_raw: 31 | component_key: model 32 | variant_key: gpt2 33 | config: 34 | sample_key: "input_ids" 35 | poe_type: ABSOLUTE 36 | prediction_key: "logits" 37 | sequence_length: 2048 38 | vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency 39 | n_layer: 12 40 | n_head_q: 12 41 | n_head_kv: 12 42 | ffn_hidden: 3072 43 | n_embd: 768 44 | dropout: 0.0 45 | bias: false # True: bias in Linears and LayerNorms, like GPT-2. 
False: a bit better and faster 46 | attention_config: 47 | qkv_transforms: [] 48 | attention_implementation: manual 49 | activation_type: gelu 50 | attention_norm_config: 51 | norm_type: rms_norm 52 | config: 53 | ndim: ${model_raw.config.n_embd} 54 | bias: false 55 | epsilon: 1e-5 56 | ffn_norm_config: 57 | norm_type: rms_norm 58 | config: 59 | ndim: ${model_raw.config.n_embd} 60 | bias: false 61 | epsilon: 1e-5 62 | lm_head_norm_config: 63 | norm_type: rms_norm 64 | config: 65 | ndim: ${model_raw.config.n_embd} 66 | bias: false 67 | epsilon: 1e-5 68 | use_weight_tying: true 69 | use_meta_device: false 70 | 71 | mfu_calculator: 72 | component_key: mfu_calculator 73 | variant_key: gpt2 74 | config: 75 | n_layer: ${model_raw.config.n_layer} 76 | sequence_length: ${model_raw.config.sequence_length} 77 | n_embd: ${model_raw.config.n_embd} 78 | world_size: ${cuda_env:WORLD_SIZE} 79 | wrapped_model: 80 | instance_key: test_model 81 | pass_type: BY_REFERENCE -------------------------------------------------------------------------------- /tests/test_yaml_configs/gpt2_config_mfu_fsdp2.yaml: -------------------------------------------------------------------------------- 1 | test_model: 2 | component_key: model 3 | variant_key: model_initialized 4 | config: 5 | model: 6 | instance_key: fsdp_model 7 | pass_type: BY_REFERENCE 8 | model_initializer: 9 | component_key: model_initialization 10 | variant_key: composed 11 | config: 12 | model_type: gpt2 13 | weight_init_type: scaled 14 | mean: 0.0 15 | std: 0.02 16 | num_layers: ${model_raw.config.n_layer} 17 | 18 | fsdp_model: 19 | component_key: model 20 | variant_key: fsdp2_wrapped 21 | config: 22 | model: 23 | instance_key: model_raw 24 | pass_type: BY_REFERENCE 25 | device_mesh: 26 | instance_key: device_mesh 27 | pass_type: BY_REFERENCE 28 | mixed_precision_settings: 29 | param_dtype: BF_16 30 | reduce_dtype: BF_16 31 | block_names: [GPT2Block] 32 | 33 | model_raw: 34 | component_key: model 35 | variant_key: gpt2 36 | config: 37 | sample_key: "input_ids" 38 | poe_type: ABSOLUTE 39 | prediction_key: "logits" 40 | sequence_length: 2048 41 | vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency 42 | n_layer: 12 43 | n_head_q: 12 44 | n_head_kv: 12 45 | ffn_hidden: 3072 46 | n_embd: 768 47 | dropout: 0.0 48 | bias: false # True: bias in Linears and LayerNorms, like GPT-2. 
False: a bit better and faster 49 | attention_config: 50 | qkv_transforms: [] 51 | attention_implementation: manual 52 | activation_type: gelu 53 | attention_norm_config: 54 | norm_type: rms_norm 55 | config: 56 | ndim: ${model_raw.config.n_embd} 57 | bias: false 58 | epsilon: 1e-5 59 | ffn_norm_config: 60 | norm_type: rms_norm 61 | config: 62 | ndim: ${model_raw.config.n_embd} 63 | bias: false 64 | epsilon: 1e-5 65 | lm_head_norm_config: 66 | norm_type: rms_norm 67 | config: 68 | ndim: ${model_raw.config.n_embd} 69 | bias: false 70 | epsilon: 1e-5 71 | use_weight_tying: true 72 | use_meta_device: false 73 | 74 | device_mesh: 75 | component_key: device_mesh 76 | variant_key: default 77 | config: 78 | device_type: cuda 79 | data_parallel_replicate_degree: 1 80 | data_parallel_shard_degree: ${cuda_env:WORLD_SIZE} # i.e., fully sharded 81 | world_size: ${cuda_env:WORLD_SIZE} 82 | 83 | mfu_calculator: 84 | component_key: mfu_calculator 85 | variant_key: gpt2 86 | config: 87 | n_layer: ${model_raw.config.n_layer} 88 | sequence_length: ${model_raw.config.sequence_length} 89 | n_embd: ${model_raw.config.n_embd} 90 | world_size: ${cuda_env:WORLD_SIZE} 91 | wrapped_model: 92 | instance_key: test_model 93 | pass_type: BY_REFERENCE -------------------------------------------------------------------------------- /tests/test_yaml_configs/gpt2_config_optimizer.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | component_key: model 3 | variant_key: model_initialized 4 | config: 5 | model: 6 | instance_key: model_raw 7 | pass_type: BY_REFERENCE 8 | model_initializer: 9 | component_key: model_initialization 10 | variant_key: composed 11 | config: 12 | model_type: gpt2 13 | weight_init_type: scaled 14 | mean: 0.0 15 | std: 0.02 16 | num_layers: ${model_raw.config.n_layer} 17 | 18 | model_raw: 19 | component_key: model 20 | variant_key: gpt2 21 | config: 22 | sample_key: "input_ids" 23 | poe_type: ABSOLUTE 24 | prediction_key: "logits" 25 | sequence_length: 2048 26 | vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency 27 | n_layer: 12 28 | n_head_q: 12 29 | n_head_kv: 12 30 | ffn_hidden: 2048 31 | n_embd: 768 32 | dropout: 0.0 33 | bias: true # True: bias in Linears and LayerNorms, like GPT-2. 
False: a bit better and faster 34 | attention_config: 35 | qkv_transforms: [] 36 | attention_implementation: manual 37 | activation_type: gelu 38 | attention_norm_config: 39 | norm_type: rms_norm 40 | config: 41 | ndim: ${model_raw.config.n_embd} 42 | bias: true 43 | epsilon: 1e-5 44 | ffn_norm_config: 45 | norm_type: rms_norm 46 | config: 47 | ndim: ${model_raw.config.n_embd} 48 | bias: true 49 | epsilon: 1e-5 50 | lm_head_norm_config: 51 | norm_type: rms_norm 52 | config: 53 | ndim: ${model_raw.config.n_embd} 54 | bias: true 55 | epsilon: 1e-5 56 | use_weight_tying: true -------------------------------------------------------------------------------- /tests/tmp/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/tmp/.gitkeep -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tests/utils/__init__.py -------------------------------------------------------------------------------- /tests/utils/test_seeding.py: -------------------------------------------------------------------------------- 1 | from pytest import mark 2 | 3 | from modalities.utils.seeding import calculate_hashed_seed 4 | 5 | 6 | @mark.parametrize( 7 | "input_data, max_seed", 8 | [ 9 | (["a", "b", "c"], 2**32 - 1), 10 | (["d", "e", "f"], 2**32 - 1), 11 | (["g", "hij", "klmnop"], 2**32 - 1), 12 | ( 13 | [ 14 | "5d3b0e03a13dff183d4d77bc258bec18", 15 | "5d3b0e03a13dff183d4d77bc258bec18", 16 | "5d3b0e03a13dff183d4d77bc258bec18", 17 | ], 18 | 2**32 - 1, 19 | ), 20 | ( 21 | [ 22 | "123b0e03a13dff183d4d77bc258bec18", 23 | "456b0e03a13dff183d4d77bc258bec18", 24 | "789b0e03a13dff183d4d77bc258bec18", 25 | ], 26 | 2**32 - 1, 27 | ), 28 | ], 29 | ) 30 | def test_calculate_seed(input_data: list[str], max_seed: int): 31 | seed = calculate_hashed_seed(input_data=input_data, max_seed=max_seed) 32 | print(seed) 33 | assert seed >= 0 34 | assert seed < max_seed 35 | -------------------------------------------------------------------------------- /tutorials/getting_started/checkpoints/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tutorials/getting_started/checkpoints/.gitkeep -------------------------------------------------------------------------------- /tutorials/getting_started/configs/example_conversion_config_template.yaml: -------------------------------------------------------------------------------- 1 | tokenizer: 2 | component_key: tokenizer 3 | variant_key: pretrained_hf_tokenizer 4 | config: 5 | pretrained_model_name_or_path: tokenizer 6 | padding: false 7 | truncation: false 8 | 9 | checkpointed_model: 10 | component_key: model 11 | variant_key: fsdp1_checkpointed 12 | config: 13 | checkpoint_loading: 14 | component_key: checkpoint_loading 15 | variant_key: torch 16 | config: 17 | device: cpu 18 | precision: BF16 19 | model: 20 | instance_key: model 21 | pass_type: BY_REFERENCE 22 | checkpoint_path: -------------------------------------------------------------------------------- /tutorials/getting_started/configs/example_dataset_config_test.yaml: 
-------------------------------------------------------------------------------- 1 | settings: 2 | src_path: data/raw/redpajama_v2_samples_512_test.jsonl 3 | dst_path: data/mem_map/redpajama_v2_samples_512_test.pbin 4 | index_path: data/mem_map/redpajama_v2_samples_512_test.idx 5 | jq_pattern: .raw_content 6 | num_cpus: ${node_env:num_cpus} 7 | eod_token: <|endoftext|> 8 | processing_batch_size: 1000 9 | raw_samples_queue_size: 300 10 | processed_samples_queue_size: 300 11 | 12 | tokenizer: 13 | component_key: tokenizer 14 | variant_key: pretrained_hf_tokenizer 15 | config: 16 | pretrained_model_name_or_path: tokenizer 17 | padding: false 18 | truncation: false 19 | -------------------------------------------------------------------------------- /tutorials/getting_started/configs/example_dataset_config_train.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | src_path: data/raw/redpajama_v2_samples_512_train.jsonl 3 | dst_path: data/mem_map/redpajama_v2_samples_512_train.pbin 4 | index_path: data/mem_map/redpajama_v2_samples_512_train.idx 5 | jq_pattern: .raw_content 6 | num_cpus: ${node_env:num_cpus} 7 | eod_token: <|endoftext|> 8 | processing_batch_size: 1000 9 | raw_samples_queue_size: 300 10 | processed_samples_queue_size: 300 11 | 12 | tokenizer: 13 | component_key: tokenizer 14 | variant_key: pretrained_hf_tokenizer 15 | config: 16 | pretrained_model_name_or_path: tokenizer 17 | padding: false 18 | truncation: false 19 | -------------------------------------------------------------------------------- /tutorials/getting_started/data/mem_map/.git_keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tutorials/getting_started/data/mem_map/.git_keep -------------------------------------------------------------------------------- /tutorials/getting_started/scripts/run_checkpoint_conversion.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | # --------------------------------------------- 5 | # bash run_checkpoint_conversion 6 | # --------------------------------------------- 7 | 8 | ####################### 9 | ### INPUT ARGUMENTS ### 10 | ####################### 11 | if [ -z "$1" ] || [ -z "$2" ] # if one of the two input arguments does not exist 12 | then 13 | echo "Need to specify arguments, e.g. 
bash run_checkpoint_conversion.sh modalities_config output_dir" 14 | exit 15 | fi 16 | 17 | ############# 18 | ### RUN ##### 19 | ############# 20 | echo "> run checkpoint conversion" 21 | echo "python ../../src/modalities/conversion/gpt2/convert_gpt2.py" $1 $2 "--num_testruns 5" 22 | python ../../src/modalities/conversion/gpt2/convert_gpt2.py $1 $2 --num_testruns 5 23 | -------------------------------------------------------------------------------- /tutorials/getting_started/scripts/run_getting_started_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # --------------------------------------------- 5 | # bash run_getting_started_example.sh 0 1 6 | # (can only be run on 2 GPUs using this script) 7 | # --------------------------------------------- 8 | 9 | ####################### 10 | ### INPUT ARGUMENTS ### 11 | ####################### 12 | if [ -z "$1" ] || [ -z "$2" ] # if one of the two input arguments does not exist 13 | then 14 | echo "Need to specify 2 GPU devices as arguments, e.g. bash run_getting_started_example.sh 0 1" 15 | exit 16 | fi 17 | if [[ $1 =~ [^0-7] ]] || [[ $2 =~ [^0-7] ]] # if one of the two input arguments contains a character other than 0-7 18 | then 19 | echo "Need to specify integers 0-7 as arguments, e.g. bash run_getting_started_example.sh 0 1" 20 | exit 21 | fi 22 | 23 | CUDA_VISIBLE_DEVICES="$1,$2" 24 | 25 | ############# 26 | ### RUN ##### 27 | ############# 28 | echo "> run getting_started_example on CUDA_VISIBLE_DEVICES="$CUDA_VISIBLE_DEVICES 29 | 30 | modalities data create_raw_index --index_path data/mem_map/redpajama_v2_samples_512_train.idx data/raw/redpajama_v2_samples_512_train.jsonl 31 | modalities data create_raw_index --index_path data/mem_map/redpajama_v2_samples_512_test.idx data/raw/redpajama_v2_samples_512_test.jsonl 32 | modalities data pack_encoded_data configs/example_dataset_config_train.yaml 33 | modalities data pack_encoded_data configs/example_dataset_config_test.yaml 34 | CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES torchrun --rdzv-endpoint localhost:29505 --nnodes 1 --nproc_per_node 2 $(which modalities) run --config_file_path configs/example_config.yaml 35 | -------------------------------------------------------------------------------- /tutorials/getting_started/tokenizer/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "add_prefix_space": false, 3 | "added_tokens_decoder": { 4 | "50256": { 5 | "content": "<|endoftext|>", 6 | "lstrip": false, 7 | "normalized": true, 8 | "rstrip": false, 9 | "single_word": false, 10 | "special": true 11 | } 12 | }, 13 | "bos_token": "<|endoftext|>", 14 | "clean_up_tokenization_spaces": true, 15 | "eos_token": "<|endoftext|>", 16 | "model_max_length": 1024, 17 | "tokenizer_class": "GPT2Tokenizer", 18 | "unk_token": "<|endoftext|>" 19 | } 20 | -------------------------------------------------------------------------------- /tutorials/library_usage/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | import torch 5 | from pydantic import BaseModel 6 | 7 | from modalities.__main__ import Main 8 | from modalities.batch import DatasetBatch 9 | from modalities.config.config import ProcessGroupBackendType 10 | from modalities.config.instantiation_models import TrainingComponentsInstantiationModel 11 | from modalities.models.gpt2.collator import CollateFnIF 12 | from modalities.running_env.cuda_env
import CudaEnv 13 | 14 | 15 | class CustomGPT2LLMCollateFnConfig(BaseModel): 16 | sample_key: str 17 | target_key: str 18 | custom_attribute: str 19 | 20 | 21 | class CustomGPT2LLMCollateFn(CollateFnIF): 22 | def __init__(self, sample_key: str, target_key: str, custom_attribute: str): 23 | self.sample_key = sample_key 24 | self.target_key = target_key 25 | self.custom_attribute = custom_attribute 26 | self._num_calls = 0 27 | 28 | @property 29 | def num_calls(self) -> int: 30 | return self._num_calls 31 | 32 | def __call__(self, batch: list[list[int]]) -> DatasetBatch: 33 | sample_tensor = torch.tensor(batch) 34 | samples = {self.sample_key: sample_tensor[:, :-1]} 35 | targets = {self.target_key: sample_tensor[:, 1:]} 36 | self._num_calls += 1 37 | return DatasetBatch(targets=targets, samples=samples) 38 | 39 | 40 | def main(): 41 | # load and parse the config file 42 | cwd = Path(__file__).parent 43 | # change to cwd 44 | os.chdir(cwd) 45 | config_file_path = cwd / Path("config_lorem_ipsum.yaml") 46 | 47 | with CudaEnv(process_group_backend=ProcessGroupBackendType.nccl): 48 | # instantiate the Main entrypoint of modalities by passing in the config path 49 | modalities_main = Main(config_path=config_file_path) 50 | 51 | # add the custom component to modalities 52 | modalities_main.add_custom_component( 53 | component_key="collate_fn", 54 | variant_key="custom_gpt_2_llm_collator", 55 | custom_component=CustomGPT2LLMCollateFn, 56 | custom_config=CustomGPT2LLMCollateFnConfig, 57 | ) 58 | # run the experiment 59 | components: TrainingComponentsInstantiationModel = modalities_main.build_components( 60 | components_model_type=TrainingComponentsInstantiationModel 61 | ) 62 | modalities_main.run(components) 63 | 64 | collate_fn = components.train_dataloader.collate_fn 65 | if collate_fn.num_calls < 1: 66 | raise ValueError("Custom collator was not called during training.") 67 | print(f"Custom collator was called {collate_fn.num_calls} times during training.") 68 | 69 | 70 | if __name__ == "__main__": 71 | main() 72 | -------------------------------------------------------------------------------- /tutorials/library_usage/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | CUDA_VISIBLE_DEVICES=0,1 torchrun --rdzv-endpoint localhost:29504 --nnodes 1 --nproc_per_node 2 main.py -------------------------------------------------------------------------------- /tutorials/library_usage/tokenizer/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "add_prefix_space": false, 3 | "added_tokens_decoder": { 4 | "50256": { 5 | "content": "<|endoftext|>", 6 | "lstrip": false, 7 | "normalized": true, 8 | "rstrip": false, 9 | "single_word": false, 10 | "special": true 11 | } 12 | }, 13 | "bos_token": "<|endoftext|>", 14 | "clean_up_tokenization_spaces": true, 15 | "eos_token": "<|endoftext|>", 16 | "model_max_length": 1024, 17 | "tokenizer_class": "GPT2Tokenizer", 18 | "unk_token": "<|endoftext|>" 19 | } 20 | -------------------------------------------------------------------------------- /tutorials/modalities_in_15_mins/README.md: -------------------------------------------------------------------------------- 1 | # Getting started with Modalities in 15 minutes 2 | 3 | Throughout the tutorial, we will use the Jupyter Notebook `modalities_demo.ipynb` to guide us through the process of getting started with Modalities. 
The notebook is located in the root directory of the tutorial, along with the `configs` and `data` directories. The `configs` directory contains configuration files for the model pretraining and tokenization, while the `data` directory contains subdirectories for storing checkpoints, preprocessed data, raw data, and tokenizer-related files. 4 | 5 | ```text 6 | └── getting_started_15mins # Root directory for the tutorial 7 | ├── modalities_demo.ipynb # Jupyter Notebook which we will be using for the tutorial. 8 | ├── configs 9 | │ ├── pretraining_config.yaml # Config file for the model pretraining 10 | │ └── tokenization_config.yaml # Config file for tokenization 11 | └── data 12 | ├── checkpoints # Dir where model and optimizer checkpoints are stored. 13 | │ └── 14 | ├── preprocessed # Dir containing preprocessed training and evaluation data. 15 | │ └── 16 | ├── raw 17 | │ └── fineweb_edu_num_docs_483606.jsonl # JSONL file containing raw data for training and evaluation. 18 | └── tokenizer 19 | ├── tokenizer.json # JSON file defining the tokenizer model, including token mappings. 20 | └── tokenizer_config.json # Config file specifying all tokenizer settings 21 | ``` 22 | 23 | 24 | To start the tutorial check out the Jupyter Notebook `modalities_demo.ipynb` and follow the instructions provided in the notebook. 25 | If you do not have Jupyter Notebook installed in your python environment yet, you can install it by running the following command: 26 | 27 | ```bash 28 | pip install jupyterlab 29 | ``` -------------------------------------------------------------------------------- /tutorials/modalities_in_15_mins/configs/tokenization_config.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | src_path: data/raw/fineweb_edu_num_docs_483606.jsonl 3 | dst_path: data/preprocessed/fineweb_edu_num_docs_483606.pbin 4 | index_path: data/preprocessed/fineweb_edu_num_docs_483606.idx 5 | jq_pattern: .text 6 | num_cpus: ${node_env:num_cpus} 7 | eod_token: <|endoftext|> 8 | processing_batch_size: 10 9 | raw_samples_queue_size: 300 10 | processed_samples_queue_size: 300 11 | 12 | tokenizer: 13 | component_key: tokenizer 14 | variant_key: pretrained_hf_tokenizer 15 | config: 16 | pretrained_model_name_or_path: data/tokenizer 17 | padding: false 18 | truncation: false -------------------------------------------------------------------------------- /tutorials/modalities_in_15_mins/data/checkpoints/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tutorials/modalities_in_15_mins/data/checkpoints/.gitkeep -------------------------------------------------------------------------------- /tutorials/modalities_in_15_mins/data/preprocessed/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tutorials/modalities_in_15_mins/data/preprocessed/.gitkeep -------------------------------------------------------------------------------- /tutorials/modalities_in_15_mins/data/raw/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tutorials/modalities_in_15_mins/data/raw/.gitkeep -------------------------------------------------------------------------------- 
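Editorial aside: the `pretrained_hf_tokenizer` component in the tokenization config above points at the `data/tokenizer` directory, which follows the standard Hugging Face fast-tokenizer layout (`tokenizer.json` plus the `tokenizer_config.json` shown next). A minimal sketch for sanity-checking such a tokenizer directory outside of Modalities — assuming the `transformers` package is installed; this is an illustration, not how the component itself loads the files:

```python
from transformers import AutoTokenizer

# Load tokenizer.json / tokenizer_config.json from the tutorial's tokenizer directory.
tokenizer = AutoTokenizer.from_pretrained("data/tokenizer")

# The GPT-2 style setup uses <|endoftext|> (id 50256) as bos/eos/unk token.
ids = tokenizer("Hello world<|endoftext|>")["input_ids"]
print(ids)
print(tokenizer.decode(ids))  # should round-trip to the original text
```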
/tutorials/modalities_in_15_mins/data/tokenizer/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "add_prefix_space": false, 3 | "added_tokens_decoder": { 4 | "50256": { 5 | "content": "<|endoftext|>", 6 | "lstrip": false, 7 | "normalized": true, 8 | "rstrip": false, 9 | "single_word": false, 10 | "special": true 11 | } 12 | }, 13 | "bos_token": "<|endoftext|>", 14 | "clean_up_tokenization_spaces": true, 15 | "eos_token": "<|endoftext|>", 16 | "model_max_length": 1024, 17 | "tokenizer_class": "GPT2Tokenizer", 18 | "unk_token": "<|endoftext|>" 19 | } 20 | -------------------------------------------------------------------------------- /tutorials/modalities_in_15_mins/res/banner.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tutorials/modalities_in_15_mins/res/banner.jpg -------------------------------------------------------------------------------- /tutorials/modalities_in_15_mins/res/notebooks_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Modalities/modalities/8270a7a28f819a2159c2a53b060faced61d0fe75/tutorials/modalities_in_15_mins/res/notebooks_1.png -------------------------------------------------------------------------------- /tutorials/warmstart/README.md: -------------------------------------------------------------------------------- 1 | # Warmstart Tutorial 2 | 3 | In this tutorial, we demonstrate how you can continue the training from a checkpoint, e.g., after the training was interrupted or crashed. 4 | 5 | ## Prerequisites 6 | We will use the data from the [Modalities in 15 mins Tutorial](../modalities_in_15_mins/modalities_demo.ipynb). 7 | If you haven't already, please run the data generation part of the notebook to generate the data. 8 | 9 | 10 | ## Running and warmstarting the model training 11 | 12 | To train the model, we will execute the configuration file `pre_training_config.yaml` stored in the folder `configs`, as follows: 13 | 14 | ```bash 15 | CUDA_VISIBLE_DEVICES="5,6" torchrun \ 16 | --rdzv-endpoint localhost:29516 \ 17 | --nnodes 1 \ 18 | --nproc_per_node 2 \ 19 | $(which modalities) run \ 20 | --config_file_path configs/pre_training_config.yaml 21 | ``` 22 | 23 | 24 | We will interrupt the training manually (e.g., CTRL + C) after the step-250 checkpoint has been written out to `data/checkpoints/`. 25 | 26 | To continue the training from the checkpoint, we will execute the configuration file `warmstart.yaml` stored in the folder `configs`, running the command below. 27 | Note that we have to change the paths under `warmstart_checkpoint_paths` in `warmstart.yaml` so that they point to the correct model and optimizer checkpoint files. 28 | 29 | ```bash 30 | CUDA_VISIBLE_DEVICES="5,6" torchrun \ 31 | --rdzv-endpoint localhost:29516 \ 32 | --nnodes 1 \ 33 | --nproc_per_node 2 \ 34 | $(which modalities) run \ 35 | --config_file_path configs/warmstart.yaml 36 | ``` 37 | 38 | 39 | Note that warmstarts do not require you to run the training on the exact same hardware. You can adapt the number of GPUs, number of tokens per batch, etc. in the command line arguments and in the configuration file. 40 | However, the training result is most likely not exactly the same as if you had continued the training on the same hardware.
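To make the hardware-flexibility point concrete, here is a back-of-the-envelope sketch (an editorial illustration with made-up numbers, not Modalities' internal accounting): the quantity that should stay comparable across a warmstart is the number of tokens consumed per optimizer step, which is also what the `enforce_tokens_per_step_consistency` check described next guards.

```python
def tokens_per_step(num_ranks: int, micro_batch_size: int,
                    grad_accumulation_steps: int, sequence_length: int) -> int:
    # Tokens consumed by one optimizer step across all data-parallel ranks.
    return num_ranks * micro_batch_size * grad_accumulation_steps * sequence_length

# Moving from 2 GPUs to 4 GPUs while halving the per-GPU micro-batch size
# keeps the tokens per step identical.
assert tokens_per_step(2, 8, 1, 2048) == tokens_per_step(4, 4, 1, 2048) == 32768
```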

We specify consistency checks in the configuration file, such as

```yaml
consistency_enforcement:
  enforce_tokens_per_step_consistency: true
  enforce_last_step_logged: false
  enforce_last_step_evaluated: false
  enforce_last_step_checkpointed: false
```

which can be relaxed to only print warnings instead of raising exceptions.

--------------------------------------------------------------------------------
/tutorials/warmstart/configs/tokenization_config_train.yaml:
--------------------------------------------------------------------------------
settings:
  src_path: ../../getting_started/data/raw/redpajama_v2_samples_512_train.jsonl
  dst_path: ../data/mem_map/redpajama_v2_samples_512_train.pbin
  index_path: ../data/mem_map/redpajama_v2_samples_512_train.idx
  jq_pattern: .raw_content
  num_cpus: ${node_env:num_cpus}
  eod_token: <|endoftext|>
  processing_batch_size: 1000
  raw_samples_queue_size: 300
  processed_samples_queue_size: 300

tokenizer:
  component_key: tokenizer
  variant_key: pretrained_hf_tokenizer
  config:
    pretrained_model_name_or_path: ../../getting_started/tokenizer
    padding: false
    truncation: false
--------------------------------------------------------------------------------
/tutorials/warmstart/scripts/check_checkpoint_consistency.py:
--------------------------------------------------------------------------------
import glob
import os
import re
from pathlib import Path


def _get_checkpoint_file_name_without_eid(checkpoint_file_name: str) -> str:
    # Remove the experiment id prefix from the checkpoint file or folder name
    return re.sub(r"^eid_\d{4}-\d{2}-\d{2}__\d{2}-\d{2}-\d{2}_[a-f0-9]+-", "", checkpoint_file_name)


def test_checkpoint_files_exist(checkpoint_folder_path: Path, expected_checkpoint_names: list[str]):
    # Check that all the checkpoint files exist and have the correct names
    checkpoint_paths = glob.glob(str(checkpoint_folder_path / "**/*"), recursive=True)

    assert len(checkpoint_paths) == 17, "ERROR! Expected 17 files and folders in the checkpoint directory."

    assert len([p for p in checkpoint_paths if p.endswith(".distcp")]) == 6, "ERROR! Expected 6 .distcp checkpoint files."
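    # Illustrative addition, not part of the original script: verify that each
    # expected checkpoint folder name occurs among the discovered paths once the
    # run-specific experiment-id prefix has been stripped off.
    stripped_names = {_get_checkpoint_file_name_without_eid(Path(p).name) for p in checkpoint_paths}
    for expected_name in expected_checkpoint_names:
        assert any(expected_name in name for name in stripped_names), f"ERROR! Missing checkpoint '{expected_name}'."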


if __name__ == "__main__":
    current_file_path = Path(__file__).resolve()
    os.chdir(current_file_path.parent)

    checkpoint_folder_path = Path("../data/checkpoints")

    expected_checkpoint_folder_names = [
        # pretrain checkpoint
        "seen_steps_11-seen_tokens_45056-target_steps_20-target_tokens_81920",
        # warmstart checkpoints
        "seen_steps_15-seen_tokens_61440-target_steps_20-target_tokens_81920",
        "seen_steps_20-seen_tokens_81920-target_steps_20-target_tokens_81920",
    ]

    test_checkpoint_files_exist(checkpoint_folder_path, expected_checkpoint_folder_names)
--------------------------------------------------------------------------------
/tutorials/warmstart/scripts/pre_train_and_warmstart.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set -ex


# ---------------------------------------------
# bash pre_train_and_warmstart.sh 0 1
# (can only be run on 2 GPUs using this script)
# ---------------------------------------------

#######################
### INPUT ARGUMENTS ###
#######################
if [ -z "$1" ] || [ -z "$2" ]  # if one of the two input arguments is missing
then
    echo "Need to specify 2 GPU devices as arguments, e.g. bash pre_train_and_warmstart.sh 0 1"
    exit 1
fi
if [[ $1 =~ [^0-7] ]] || [[ $2 =~ [^0-7] ]]  # if one of the two input arguments is not an integer between 0 and 7
then
    echo "Need to specify integers 0-7 as arguments, e.g. bash pre_train_and_warmstart.sh 0 1"
    exit 1
fi

CUDA_VISIBLE_DEVICES="$1,$2"


echo "> run warmstart example on CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"

# cd to the directory of the script (absolute path)
cd "$(dirname "$0")"

rm -rf ../data/


# run preprocessing: build the raw-data index, then tokenize and pack the data
modalities data create_raw_index --index_path ../data/mem_map/redpajama_v2_samples_512_train.idx ../../getting_started/data/raw/redpajama_v2_samples_512_train.jsonl
modalities data pack_encoded_data ../configs/tokenization_config_train.yaml

# run pretraining
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES torchrun --rdzv-endpoint localhost:29504 --nnodes 1 --nproc_per_node 2 $(which modalities) run --config_file_path ../configs/pre_training_config.yaml

# run warmstart from the last written checkpoint
checkpoint_path=$(find ../data/checkpoints -name "last_checkpoint_info.json" -exec realpath {} \;)
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES torchrun --rdzv-endpoint localhost:29504 --nnodes 1 --nproc_per_node 2 $(which modalities) warmstart --config_file_path ../configs/warmstart_config.yaml --last_checkpoint_info_file_path $checkpoint_path

# run consistency checks on the written checkpoints
python check_checkpoint_consistency.py

rm -rf ../data/

echo "Finished warmstart example"
--------------------------------------------------------------------------------