├── .cursor └── rules │ ├── performance-optimization.mdc │ ├── philosophy.mdc │ ├── pipeline-parallelism.mdc │ ├── project-overview.mdc │ ├── tensor-parallelism.mdc │ └── troubleshooting.mdc ├── .github ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── 3d_parallelism_unit_tests.yaml │ ├── code_quality.yaml │ ├── fa2_unit_tests.yaml │ ├── pr-rules.yaml │ ├── python-release.yml │ └── trufflehog.yml ├── .gitignore ├── .pre-commit-config-check.yaml ├── .pre-commit-config.yaml ├── .pylintrc ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── docs ├── 3d_parallelism.md ├── benchmark_summary.svg ├── cuda_event_timing.md ├── debugging.md ├── docs.md ├── image-2.png ├── image.png ├── multi-node-training.md ├── nanoset.md └── your-first-training.md ├── examples ├── __init__.py ├── bench_llama_7b.py ├── config_nanoset.yaml ├── config_qwen.py ├── config_qwen.yaml ├── config_qwen_with_moe.yaml ├── config_resume_training.py ├── config_resume_training.yaml ├── config_tiny_llama.py ├── config_tiny_llama.yaml ├── config_tiny_llama_with_s3_upload.yaml ├── contributor-guide │ ├── README.md │ ├── assets │ │ ├── 1.png │ │ ├── 10.png │ │ ├── 11.png │ │ ├── 2.png │ │ ├── 3.png │ │ ├── 4.png │ │ ├── 5.png │ │ ├── 6.png │ │ ├── 7.png │ │ ├── 8.png │ │ └── 9.png │ ├── debug_config_tiny_llama.py │ ├── debug_config_tiny_llama.yaml │ └── debug_tiny_llama.sh ├── custom-dataloader │ ├── README.md │ ├── config_custom_dl.yaml │ └── run_train.py ├── doremi │ ├── README.md │ ├── __init__.py │ ├── assets │ │ ├── domain_weights.png │ │ ├── not_outperform.png │ │ └── outperform.png │ ├── configs │ │ ├── config_2.8b_llama.yaml │ │ ├── config_2.8b_llama_with_tuned_weights.yaml │ │ ├── config_280m_llama.yaml │ │ └── config_280m_llama_proxy.yaml │ ├── doremi │ │ ├── __init__.py │ │ ├── config.py │ │ ├── dataloader.py │ │ ├── doremi_context.py │ │ ├── llama.py │ │ ├── loss.py │ │ ├── trainer.py │ │ └── utils.py │ ├── requirements.txt │ ├── tests │ │ ├── test_doremi_context.py │ │ ├── test_doremi_dataloader.py │ │ ├── test_doremi_loss.py │ │ ├── test_doremi_sampler.py │ │ ├── test_doremi_utils.py │ │ └── utils.py │ ├── train_doremi.py │ ├── train_reference.py │ └── utils.py ├── inference │ └── qwen_moe │ │ ├── README.md │ │ └── convert.py ├── llama │ ├── README.md │ ├── __init__.py │ ├── convert_hf_to_nanotron.py │ ├── convert_nanotron_to_hf.py │ ├── convert_weights.py │ ├── requirements.txt │ └── tests │ │ ├── test_conversion.py │ │ ├── test_conversion.py.orig │ │ └── utils.py ├── mamba │ ├── README.md │ ├── assets │ │ └── loss_mamba.png │ ├── config.py │ ├── config_mamba.yaml │ ├── convert_hf_to_nanotron.py │ ├── convert_nanotron_to_hf.py │ ├── create_config_mamba.py │ ├── mamba.py │ ├── requirements.txt │ ├── run_generate.py │ ├── run_multinode.sh │ ├── selective_scan_interface.py │ ├── train_mamba.py │ ├── train_mamba.sh │ └── trainer.py ├── moe │ ├── README.md │ ├── config_llamoe.py │ ├── config_llamoe.yaml │ ├── llamoe.py │ ├── moe.py │ ├── requirements.txt │ └── train_moe.py ├── mup │ ├── README.md │ ├── assets │ │ ├── llama.png │ │ ├── scale-across-depth.png │ │ └── scale-across-width.png │ └── configs │ │ ├── mup_350m_llama_config.yaml │ │ └── sp_350m_llama_config.yaml └── train_tiny_llama.sh ├── pyproject.toml ├── run_evals.py ├── run_generate.py ├── run_train.py ├── scripts ├── fix_checkpoint_bad_naming.py ├── log_lighteval_to_wandb.py ├── scaling_benchmarks.py └── weka.py ├── slurm_launcher.py ├── src └── nanotron │ ├── __init__.py │ ├── config │ ├── __init__.py │ ├── config.py │ ├── lighteval_config.py │ ├── models_config.py │ ├── parallelism_config.py │ └── utils_config.py │ ├── constants.py │ ├── data │ ├── __init__.py │ ├── clm_collator.py │ ├── dataloader.py │ ├── dataloader_builder.py │ ├── nanoset.py │ ├── nemo_dataset │ │ ├── Makefile │ │ ├── __init__.py │ │ ├── blendable_dataset.py │ │ ├── dataset_utils.py │ │ ├── helpers.cpp │ │ └── indexed_dataset.py │ ├── processing.py │ ├── s3_utils.py │ ├── samplers.py │ ├── sft_processing.py │ ├── tokenized_bytes.py │ └── utils.py │ ├── distributed.py │ ├── eval │ ├── README.md │ ├── __init__.py │ ├── evaluation_tasks.py │ ├── one_job_runner.py │ └── upload_to_wandb.py │ ├── fp8 │ ├── __init__.py │ ├── constants.py │ ├── dtypes.py │ ├── kernel.py │ ├── linear.py │ ├── meta.py │ ├── parameter.py │ ├── tensor.py │ └── utils.py │ ├── generation │ ├── __init__.py │ ├── decode.py │ ├── generate_store.py │ └── sampler.py │ ├── helpers.py │ ├── logging │ ├── __init__.py │ ├── base.py │ ├── logmixin.py │ └── timers.py │ ├── metrics_logging.py │ ├── models │ ├── __init__.py │ ├── base.py │ ├── llama.py │ ├── qwen.py │ └── starcoder2.py │ ├── nn │ ├── __init__.py │ ├── activations.py │ ├── attention.py │ ├── flex_attention.py │ ├── layer_norm.py │ ├── llama3_ring_attention.py │ ├── moe.py │ ├── ring_attention.py │ ├── ring_attention_lucidrain.py │ └── rotary.py │ ├── optim │ ├── __init__.py │ ├── base.py │ ├── clip_grads.py │ ├── gradient_accumulator.py │ ├── inherit_from_other_optimizer.py │ ├── named_optimizer.py │ ├── optimizer_from_gradient_accumulator.py │ └── zero.py │ ├── parallel │ ├── __init__.py │ ├── context.py │ ├── data_parallel │ │ └── utils.py │ ├── parameters.py │ ├── pipeline_parallel │ │ ├── README.md │ │ ├── __init__.py │ │ ├── block.py │ │ ├── context_manager.py │ │ ├── engine.py │ │ ├── functional.py │ │ ├── p2p.py │ │ ├── state.py │ │ ├── tensor_pointer.py │ │ └── utils.py │ ├── sharded_parameters.py │ ├── tensor_parallel │ │ ├── __init__.py │ │ ├── distributed_differentiable_primitives.py │ │ ├── enum.py │ │ ├── functional.py │ │ └── nn.py │ ├── tied_parameters.py │ └── utils.py │ ├── random.py │ ├── s3_checkpoints │ ├── __init__.py │ ├── fsspec.py │ └── s3_mover.py │ ├── sanity_checks.py │ ├── scaling │ └── parametrization.py │ ├── serialize │ ├── __init__.py │ ├── main.py │ ├── metadata.py │ ├── optimizer.py │ ├── random.py │ ├── utils.py │ └── weights.py │ ├── trainer.py │ └── utils.py ├── test_timer_decorator.py ├── tests ├── fp8 │ ├── test_fp8_parameter.py │ ├── test_linear.py │ └── test_tensor.py ├── helpers │ ├── context.py │ ├── data.py │ ├── distributed_tensor.py │ ├── dummy.py │ ├── exception.py │ ├── llama_helper.py │ ├── qwen_helper.py │ └── utils.py ├── kernels │ ├── run_layer_norm_convergence.py │ └── test_layer_norm.py ├── nanoset │ └── test_build_nanoset_dataloader.py ├── pytest.ini ├── test_base_model.py ├── test_checkpointing.py ├── test_clip_grads.py ├── test_data_parallel.py ├── test_distributed.py ├── test_modeling.py ├── test_moe.py ├── test_optimizer.py ├── test_optimizer_params_groups.py ├── test_p2p.py ├── test_parameter.py ├── test_parameters_accumulate_gradient_in_fp32.py ├── test_parametrization.py ├── test_pipeline_parallel.py ├── test_random_state.py ├── test_serialize.py ├── test_sft.py ├── test_tensor_parallel.py ├── test_tie_weights.py └── test_zero.py └── tools └── preprocess_data.py /.cursor/rules/performance-optimization.mdc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/.cursor/rules/performance-optimization.mdc -------------------------------------------------------------------------------- /.cursor/rules/philosophy.mdc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/.cursor/rules/philosophy.mdc -------------------------------------------------------------------------------- /.cursor/rules/pipeline-parallelism.mdc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/.cursor/rules/pipeline-parallelism.mdc -------------------------------------------------------------------------------- /.cursor/rules/project-overview.mdc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/.cursor/rules/project-overview.mdc -------------------------------------------------------------------------------- /.cursor/rules/tensor-parallelism.mdc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/.cursor/rules/tensor-parallelism.mdc -------------------------------------------------------------------------------- /.cursor/rules/troubleshooting.mdc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/.cursor/rules/troubleshooting.mdc -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/.github/PULL_REQUEST_TEMPLATE.md -------------------------------------------------------------------------------- /.github/workflows/3d_parallelism_unit_tests.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/.github/workflows/3d_parallelism_unit_tests.yaml -------------------------------------------------------------------------------- /.github/workflows/code_quality.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/.github/workflows/code_quality.yaml -------------------------------------------------------------------------------- /.github/workflows/fa2_unit_tests.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/.github/workflows/fa2_unit_tests.yaml -------------------------------------------------------------------------------- /.github/workflows/pr-rules.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/.github/workflows/pr-rules.yaml -------------------------------------------------------------------------------- /.github/workflows/python-release.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/.github/workflows/python-release.yml -------------------------------------------------------------------------------- /.github/workflows/trufflehog.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/.github/workflows/trufflehog.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/.gitignore -------------------------------------------------------------------------------- /.pre-commit-config-check.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/.pre-commit-config-check.yaml -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/.pre-commit-config.yaml -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/.pylintrc -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/CODE_OF_CONDUCT.md -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/CONTRIBUTING.md -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/LICENSE -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/Makefile -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/README.md -------------------------------------------------------------------------------- /docs/3d_parallelism.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/docs/3d_parallelism.md -------------------------------------------------------------------------------- /docs/benchmark_summary.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/docs/benchmark_summary.svg -------------------------------------------------------------------------------- /docs/cuda_event_timing.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/docs/cuda_event_timing.md -------------------------------------------------------------------------------- /docs/debugging.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/docs/debugging.md -------------------------------------------------------------------------------- /docs/docs.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/docs/docs.md -------------------------------------------------------------------------------- /docs/image-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/docs/image-2.png -------------------------------------------------------------------------------- /docs/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/docs/image.png -------------------------------------------------------------------------------- /docs/multi-node-training.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/docs/multi-node-training.md -------------------------------------------------------------------------------- /docs/nanoset.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/docs/nanoset.md -------------------------------------------------------------------------------- /docs/your-first-training.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/docs/your-first-training.md -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- 1 | # Make examples directory a proper Python package -------------------------------------------------------------------------------- /examples/bench_llama_7b.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/bench_llama_7b.py -------------------------------------------------------------------------------- /examples/config_nanoset.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/config_nanoset.yaml -------------------------------------------------------------------------------- /examples/config_qwen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/config_qwen.py -------------------------------------------------------------------------------- /examples/config_qwen.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/config_qwen.yaml -------------------------------------------------------------------------------- /examples/config_qwen_with_moe.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/config_qwen_with_moe.yaml -------------------------------------------------------------------------------- /examples/config_resume_training.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/config_resume_training.py -------------------------------------------------------------------------------- /examples/config_resume_training.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/config_resume_training.yaml -------------------------------------------------------------------------------- /examples/config_tiny_llama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/config_tiny_llama.py -------------------------------------------------------------------------------- /examples/config_tiny_llama.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/config_tiny_llama.yaml -------------------------------------------------------------------------------- /examples/config_tiny_llama_with_s3_upload.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/config_tiny_llama_with_s3_upload.yaml -------------------------------------------------------------------------------- /examples/contributor-guide/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/contributor-guide/README.md -------------------------------------------------------------------------------- /examples/contributor-guide/assets/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/contributor-guide/assets/1.png -------------------------------------------------------------------------------- /examples/contributor-guide/assets/10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/contributor-guide/assets/10.png -------------------------------------------------------------------------------- /examples/contributor-guide/assets/11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/contributor-guide/assets/11.png -------------------------------------------------------------------------------- /examples/contributor-guide/assets/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/contributor-guide/assets/2.png -------------------------------------------------------------------------------- /examples/contributor-guide/assets/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/contributor-guide/assets/3.png -------------------------------------------------------------------------------- /examples/contributor-guide/assets/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/contributor-guide/assets/4.png -------------------------------------------------------------------------------- /examples/contributor-guide/assets/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/contributor-guide/assets/5.png -------------------------------------------------------------------------------- /examples/contributor-guide/assets/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/contributor-guide/assets/6.png -------------------------------------------------------------------------------- /examples/contributor-guide/assets/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/contributor-guide/assets/7.png -------------------------------------------------------------------------------- /examples/contributor-guide/assets/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/contributor-guide/assets/8.png -------------------------------------------------------------------------------- /examples/contributor-guide/assets/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/contributor-guide/assets/9.png -------------------------------------------------------------------------------- /examples/contributor-guide/debug_config_tiny_llama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/contributor-guide/debug_config_tiny_llama.py -------------------------------------------------------------------------------- /examples/contributor-guide/debug_config_tiny_llama.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/contributor-guide/debug_config_tiny_llama.yaml -------------------------------------------------------------------------------- /examples/contributor-guide/debug_tiny_llama.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/contributor-guide/debug_tiny_llama.sh -------------------------------------------------------------------------------- /examples/custom-dataloader/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/custom-dataloader/README.md -------------------------------------------------------------------------------- /examples/custom-dataloader/config_custom_dl.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/custom-dataloader/config_custom_dl.yaml -------------------------------------------------------------------------------- /examples/custom-dataloader/run_train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/custom-dataloader/run_train.py -------------------------------------------------------------------------------- /examples/doremi/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/doremi/README.md -------------------------------------------------------------------------------- /examples/doremi/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/doremi/assets/domain_weights.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/doremi/assets/domain_weights.png -------------------------------------------------------------------------------- /examples/doremi/assets/not_outperform.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/doremi/assets/not_outperform.png -------------------------------------------------------------------------------- /examples/doremi/assets/outperform.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/doremi/assets/outperform.png -------------------------------------------------------------------------------- /examples/doremi/configs/config_2.8b_llama.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/doremi/configs/config_2.8b_llama.yaml -------------------------------------------------------------------------------- /examples/doremi/configs/config_2.8b_llama_with_tuned_weights.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/doremi/configs/config_2.8b_llama_with_tuned_weights.yaml -------------------------------------------------------------------------------- /examples/doremi/configs/config_280m_llama.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/doremi/configs/config_280m_llama.yaml -------------------------------------------------------------------------------- /examples/doremi/configs/config_280m_llama_proxy.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/doremi/configs/config_280m_llama_proxy.yaml -------------------------------------------------------------------------------- /examples/doremi/doremi/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/doremi/doremi/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/doremi/doremi/config.py -------------------------------------------------------------------------------- /examples/doremi/doremi/dataloader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/doremi/doremi/dataloader.py -------------------------------------------------------------------------------- /examples/doremi/doremi/doremi_context.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/doremi/doremi/doremi_context.py -------------------------------------------------------------------------------- /examples/doremi/doremi/llama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/doremi/doremi/llama.py -------------------------------------------------------------------------------- /examples/doremi/doremi/loss.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/doremi/doremi/loss.py -------------------------------------------------------------------------------- /examples/doremi/doremi/trainer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/doremi/doremi/trainer.py -------------------------------------------------------------------------------- /examples/doremi/doremi/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/doremi/doremi/utils.py -------------------------------------------------------------------------------- /examples/doremi/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets 2 | -------------------------------------------------------------------------------- /examples/doremi/tests/test_doremi_context.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/doremi/tests/test_doremi_context.py -------------------------------------------------------------------------------- /examples/doremi/tests/test_doremi_dataloader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/doremi/tests/test_doremi_dataloader.py -------------------------------------------------------------------------------- /examples/doremi/tests/test_doremi_loss.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/doremi/tests/test_doremi_loss.py -------------------------------------------------------------------------------- /examples/doremi/tests/test_doremi_sampler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/doremi/tests/test_doremi_sampler.py -------------------------------------------------------------------------------- /examples/doremi/tests/test_doremi_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/doremi/tests/test_doremi_utils.py -------------------------------------------------------------------------------- /examples/doremi/tests/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/doremi/tests/utils.py -------------------------------------------------------------------------------- /examples/doremi/train_doremi.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/doremi/train_doremi.py -------------------------------------------------------------------------------- /examples/doremi/train_reference.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/doremi/train_reference.py -------------------------------------------------------------------------------- /examples/doremi/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/doremi/utils.py -------------------------------------------------------------------------------- /examples/inference/qwen_moe/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/inference/qwen_moe/README.md -------------------------------------------------------------------------------- /examples/inference/qwen_moe/convert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/inference/qwen_moe/convert.py -------------------------------------------------------------------------------- /examples/llama/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/llama/README.md -------------------------------------------------------------------------------- /examples/llama/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/llama/convert_hf_to_nanotron.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/llama/convert_hf_to_nanotron.py -------------------------------------------------------------------------------- /examples/llama/convert_nanotron_to_hf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/llama/convert_nanotron_to_hf.py -------------------------------------------------------------------------------- /examples/llama/convert_weights.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/llama/convert_weights.py -------------------------------------------------------------------------------- /examples/llama/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.39.3 2 | -------------------------------------------------------------------------------- /examples/llama/tests/test_conversion.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/llama/tests/test_conversion.py -------------------------------------------------------------------------------- /examples/llama/tests/test_conversion.py.orig: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/llama/tests/test_conversion.py.orig -------------------------------------------------------------------------------- /examples/llama/tests/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/llama/tests/utils.py -------------------------------------------------------------------------------- /examples/mamba/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/mamba/README.md -------------------------------------------------------------------------------- /examples/mamba/assets/loss_mamba.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/mamba/assets/loss_mamba.png -------------------------------------------------------------------------------- /examples/mamba/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/mamba/config.py -------------------------------------------------------------------------------- /examples/mamba/config_mamba.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/mamba/config_mamba.yaml -------------------------------------------------------------------------------- /examples/mamba/convert_hf_to_nanotron.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/mamba/convert_hf_to_nanotron.py -------------------------------------------------------------------------------- /examples/mamba/convert_nanotron_to_hf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/mamba/convert_nanotron_to_hf.py -------------------------------------------------------------------------------- /examples/mamba/create_config_mamba.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/mamba/create_config_mamba.py -------------------------------------------------------------------------------- /examples/mamba/mamba.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/mamba/mamba.py -------------------------------------------------------------------------------- /examples/mamba/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/mamba/requirements.txt -------------------------------------------------------------------------------- /examples/mamba/run_generate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/mamba/run_generate.py -------------------------------------------------------------------------------- /examples/mamba/run_multinode.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/mamba/run_multinode.sh -------------------------------------------------------------------------------- /examples/mamba/selective_scan_interface.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/mamba/selective_scan_interface.py -------------------------------------------------------------------------------- /examples/mamba/train_mamba.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/mamba/train_mamba.py -------------------------------------------------------------------------------- /examples/mamba/train_mamba.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/mamba/train_mamba.sh -------------------------------------------------------------------------------- /examples/mamba/trainer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/mamba/trainer.py -------------------------------------------------------------------------------- /examples/moe/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/moe/README.md -------------------------------------------------------------------------------- /examples/moe/config_llamoe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/moe/config_llamoe.py -------------------------------------------------------------------------------- /examples/moe/config_llamoe.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/moe/config_llamoe.yaml -------------------------------------------------------------------------------- /examples/moe/llamoe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/moe/llamoe.py -------------------------------------------------------------------------------- /examples/moe/moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/moe/moe.py -------------------------------------------------------------------------------- /examples/moe/requirements.txt: -------------------------------------------------------------------------------- 1 | stanford-stk>=0.0.6 2 | megablocks==0.5.1 3 | -------------------------------------------------------------------------------- /examples/moe/train_moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/moe/train_moe.py -------------------------------------------------------------------------------- /examples/mup/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/mup/README.md -------------------------------------------------------------------------------- /examples/mup/assets/llama.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/mup/assets/llama.png -------------------------------------------------------------------------------- /examples/mup/assets/scale-across-depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/mup/assets/scale-across-depth.png -------------------------------------------------------------------------------- /examples/mup/assets/scale-across-width.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/mup/assets/scale-across-width.png -------------------------------------------------------------------------------- /examples/mup/configs/mup_350m_llama_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/mup/configs/mup_350m_llama_config.yaml -------------------------------------------------------------------------------- /examples/mup/configs/sp_350m_llama_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/mup/configs/sp_350m_llama_config.yaml -------------------------------------------------------------------------------- /examples/train_tiny_llama.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/examples/train_tiny_llama.sh -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/pyproject.toml -------------------------------------------------------------------------------- /run_evals.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/run_evals.py -------------------------------------------------------------------------------- /run_generate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/run_generate.py -------------------------------------------------------------------------------- /run_train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/run_train.py -------------------------------------------------------------------------------- /scripts/fix_checkpoint_bad_naming.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/scripts/fix_checkpoint_bad_naming.py -------------------------------------------------------------------------------- /scripts/log_lighteval_to_wandb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/scripts/log_lighteval_to_wandb.py -------------------------------------------------------------------------------- /scripts/scaling_benchmarks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/scripts/scaling_benchmarks.py -------------------------------------------------------------------------------- /scripts/weka.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/scripts/weka.py -------------------------------------------------------------------------------- /slurm_launcher.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/slurm_launcher.py -------------------------------------------------------------------------------- /src/nanotron/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.4" 2 | -------------------------------------------------------------------------------- /src/nanotron/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/config/__init__.py -------------------------------------------------------------------------------- /src/nanotron/config/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/config/config.py -------------------------------------------------------------------------------- /src/nanotron/config/lighteval_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/config/lighteval_config.py -------------------------------------------------------------------------------- /src/nanotron/config/models_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/config/models_config.py -------------------------------------------------------------------------------- /src/nanotron/config/parallelism_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/config/parallelism_config.py -------------------------------------------------------------------------------- /src/nanotron/config/utils_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/config/utils_config.py -------------------------------------------------------------------------------- /src/nanotron/constants.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/constants.py -------------------------------------------------------------------------------- /src/nanotron/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/data/__init__.py -------------------------------------------------------------------------------- /src/nanotron/data/clm_collator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/data/clm_collator.py -------------------------------------------------------------------------------- /src/nanotron/data/dataloader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/data/dataloader.py -------------------------------------------------------------------------------- /src/nanotron/data/dataloader_builder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/data/dataloader_builder.py -------------------------------------------------------------------------------- /src/nanotron/data/nanoset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/data/nanoset.py -------------------------------------------------------------------------------- /src/nanotron/data/nemo_dataset/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/data/nemo_dataset/Makefile -------------------------------------------------------------------------------- /src/nanotron/data/nemo_dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/data/nemo_dataset/__init__.py -------------------------------------------------------------------------------- /src/nanotron/data/nemo_dataset/blendable_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/data/nemo_dataset/blendable_dataset.py -------------------------------------------------------------------------------- /src/nanotron/data/nemo_dataset/dataset_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/data/nemo_dataset/dataset_utils.py -------------------------------------------------------------------------------- /src/nanotron/data/nemo_dataset/helpers.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/data/nemo_dataset/helpers.cpp -------------------------------------------------------------------------------- /src/nanotron/data/nemo_dataset/indexed_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/data/nemo_dataset/indexed_dataset.py -------------------------------------------------------------------------------- /src/nanotron/data/processing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/data/processing.py -------------------------------------------------------------------------------- /src/nanotron/data/s3_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/data/s3_utils.py -------------------------------------------------------------------------------- /src/nanotron/data/samplers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/data/samplers.py -------------------------------------------------------------------------------- /src/nanotron/data/sft_processing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/data/sft_processing.py -------------------------------------------------------------------------------- /src/nanotron/data/tokenized_bytes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/data/tokenized_bytes.py -------------------------------------------------------------------------------- /src/nanotron/data/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/data/utils.py -------------------------------------------------------------------------------- /src/nanotron/distributed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/distributed.py -------------------------------------------------------------------------------- /src/nanotron/eval/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/eval/README.md -------------------------------------------------------------------------------- /src/nanotron/eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/eval/__init__.py -------------------------------------------------------------------------------- /src/nanotron/eval/evaluation_tasks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/eval/evaluation_tasks.py -------------------------------------------------------------------------------- /src/nanotron/eval/one_job_runner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/eval/one_job_runner.py -------------------------------------------------------------------------------- /src/nanotron/eval/upload_to_wandb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/eval/upload_to_wandb.py -------------------------------------------------------------------------------- /src/nanotron/fp8/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/fp8/__init__.py -------------------------------------------------------------------------------- /src/nanotron/fp8/constants.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/fp8/constants.py -------------------------------------------------------------------------------- /src/nanotron/fp8/dtypes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/fp8/dtypes.py -------------------------------------------------------------------------------- /src/nanotron/fp8/kernel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/fp8/kernel.py -------------------------------------------------------------------------------- /src/nanotron/fp8/linear.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/fp8/linear.py -------------------------------------------------------------------------------- /src/nanotron/fp8/meta.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/fp8/meta.py -------------------------------------------------------------------------------- /src/nanotron/fp8/parameter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/fp8/parameter.py -------------------------------------------------------------------------------- /src/nanotron/fp8/tensor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/fp8/tensor.py -------------------------------------------------------------------------------- /src/nanotron/fp8/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/fp8/utils.py -------------------------------------------------------------------------------- /src/nanotron/generation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/generation/__init__.py -------------------------------------------------------------------------------- /src/nanotron/generation/decode.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/generation/decode.py -------------------------------------------------------------------------------- /src/nanotron/generation/generate_store.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/generation/generate_store.py -------------------------------------------------------------------------------- /src/nanotron/generation/sampler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/generation/sampler.py -------------------------------------------------------------------------------- /src/nanotron/helpers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/helpers.py -------------------------------------------------------------------------------- /src/nanotron/logging/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/logging/__init__.py -------------------------------------------------------------------------------- /src/nanotron/logging/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/logging/base.py -------------------------------------------------------------------------------- /src/nanotron/logging/logmixin.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/logging/logmixin.py -------------------------------------------------------------------------------- /src/nanotron/logging/timers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/logging/timers.py -------------------------------------------------------------------------------- /src/nanotron/metrics_logging.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/metrics_logging.py -------------------------------------------------------------------------------- /src/nanotron/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/models/__init__.py -------------------------------------------------------------------------------- /src/nanotron/models/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/models/base.py -------------------------------------------------------------------------------- /src/nanotron/models/llama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/models/llama.py -------------------------------------------------------------------------------- /src/nanotron/models/qwen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/models/qwen.py -------------------------------------------------------------------------------- /src/nanotron/models/starcoder2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/models/starcoder2.py -------------------------------------------------------------------------------- /src/nanotron/nn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/nanotron/nn/activations.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/nn/activations.py -------------------------------------------------------------------------------- /src/nanotron/nn/attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/nn/attention.py -------------------------------------------------------------------------------- /src/nanotron/nn/flex_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/nn/flex_attention.py -------------------------------------------------------------------------------- /src/nanotron/nn/layer_norm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/nn/layer_norm.py -------------------------------------------------------------------------------- /src/nanotron/nn/llama3_ring_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/nn/llama3_ring_attention.py -------------------------------------------------------------------------------- /src/nanotron/nn/moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/nn/moe.py -------------------------------------------------------------------------------- /src/nanotron/nn/ring_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/nn/ring_attention.py -------------------------------------------------------------------------------- /src/nanotron/nn/ring_attention_lucidrain.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/nn/ring_attention_lucidrain.py -------------------------------------------------------------------------------- /src/nanotron/nn/rotary.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/nn/rotary.py -------------------------------------------------------------------------------- /src/nanotron/optim/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/optim/__init__.py -------------------------------------------------------------------------------- /src/nanotron/optim/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/optim/base.py -------------------------------------------------------------------------------- /src/nanotron/optim/clip_grads.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/optim/clip_grads.py -------------------------------------------------------------------------------- /src/nanotron/optim/gradient_accumulator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/optim/gradient_accumulator.py -------------------------------------------------------------------------------- /src/nanotron/optim/inherit_from_other_optimizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/optim/inherit_from_other_optimizer.py -------------------------------------------------------------------------------- /src/nanotron/optim/named_optimizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/optim/named_optimizer.py -------------------------------------------------------------------------------- /src/nanotron/optim/optimizer_from_gradient_accumulator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/optim/optimizer_from_gradient_accumulator.py -------------------------------------------------------------------------------- /src/nanotron/optim/zero.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/optim/zero.py -------------------------------------------------------------------------------- /src/nanotron/parallel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/parallel/__init__.py -------------------------------------------------------------------------------- /src/nanotron/parallel/context.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/parallel/context.py -------------------------------------------------------------------------------- /src/nanotron/parallel/data_parallel/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/parallel/data_parallel/utils.py -------------------------------------------------------------------------------- /src/nanotron/parallel/parameters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/parallel/parameters.py -------------------------------------------------------------------------------- /src/nanotron/parallel/pipeline_parallel/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/parallel/pipeline_parallel/README.md -------------------------------------------------------------------------------- /src/nanotron/parallel/pipeline_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/nanotron/parallel/pipeline_parallel/block.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/parallel/pipeline_parallel/block.py -------------------------------------------------------------------------------- /src/nanotron/parallel/pipeline_parallel/context_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/parallel/pipeline_parallel/context_manager.py -------------------------------------------------------------------------------- /src/nanotron/parallel/pipeline_parallel/engine.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/parallel/pipeline_parallel/engine.py -------------------------------------------------------------------------------- /src/nanotron/parallel/pipeline_parallel/functional.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/parallel/pipeline_parallel/functional.py -------------------------------------------------------------------------------- /src/nanotron/parallel/pipeline_parallel/p2p.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/parallel/pipeline_parallel/p2p.py -------------------------------------------------------------------------------- /src/nanotron/parallel/pipeline_parallel/state.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/parallel/pipeline_parallel/state.py -------------------------------------------------------------------------------- /src/nanotron/parallel/pipeline_parallel/tensor_pointer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/parallel/pipeline_parallel/tensor_pointer.py -------------------------------------------------------------------------------- /src/nanotron/parallel/pipeline_parallel/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/parallel/pipeline_parallel/utils.py -------------------------------------------------------------------------------- /src/nanotron/parallel/sharded_parameters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/parallel/sharded_parameters.py -------------------------------------------------------------------------------- /src/nanotron/parallel/tensor_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py -------------------------------------------------------------------------------- /src/nanotron/parallel/tensor_parallel/enum.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/parallel/tensor_parallel/enum.py -------------------------------------------------------------------------------- /src/nanotron/parallel/tensor_parallel/functional.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/parallel/tensor_parallel/functional.py -------------------------------------------------------------------------------- /src/nanotron/parallel/tensor_parallel/nn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/parallel/tensor_parallel/nn.py -------------------------------------------------------------------------------- /src/nanotron/parallel/tied_parameters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/parallel/tied_parameters.py -------------------------------------------------------------------------------- /src/nanotron/parallel/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/parallel/utils.py -------------------------------------------------------------------------------- /src/nanotron/random.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/random.py -------------------------------------------------------------------------------- /src/nanotron/s3_checkpoints/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/s3_checkpoints/__init__.py -------------------------------------------------------------------------------- /src/nanotron/s3_checkpoints/fsspec.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/s3_checkpoints/fsspec.py -------------------------------------------------------------------------------- /src/nanotron/s3_checkpoints/s3_mover.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/s3_checkpoints/s3_mover.py -------------------------------------------------------------------------------- /src/nanotron/sanity_checks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/sanity_checks.py -------------------------------------------------------------------------------- /src/nanotron/scaling/parametrization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/scaling/parametrization.py -------------------------------------------------------------------------------- /src/nanotron/serialize/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/serialize/__init__.py -------------------------------------------------------------------------------- /src/nanotron/serialize/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/serialize/main.py -------------------------------------------------------------------------------- /src/nanotron/serialize/metadata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/serialize/metadata.py -------------------------------------------------------------------------------- /src/nanotron/serialize/optimizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/serialize/optimizer.py -------------------------------------------------------------------------------- /src/nanotron/serialize/random.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/serialize/random.py -------------------------------------------------------------------------------- /src/nanotron/serialize/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/serialize/utils.py -------------------------------------------------------------------------------- /src/nanotron/serialize/weights.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/serialize/weights.py -------------------------------------------------------------------------------- /src/nanotron/trainer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/trainer.py -------------------------------------------------------------------------------- /src/nanotron/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/src/nanotron/utils.py -------------------------------------------------------------------------------- /test_timer_decorator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/test_timer_decorator.py -------------------------------------------------------------------------------- /tests/fp8/test_fp8_parameter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/fp8/test_fp8_parameter.py -------------------------------------------------------------------------------- /tests/fp8/test_linear.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/fp8/test_linear.py -------------------------------------------------------------------------------- /tests/fp8/test_tensor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/fp8/test_tensor.py -------------------------------------------------------------------------------- /tests/helpers/context.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/helpers/context.py -------------------------------------------------------------------------------- /tests/helpers/data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/helpers/data.py -------------------------------------------------------------------------------- /tests/helpers/distributed_tensor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/helpers/distributed_tensor.py -------------------------------------------------------------------------------- /tests/helpers/dummy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/helpers/dummy.py -------------------------------------------------------------------------------- /tests/helpers/exception.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/helpers/exception.py -------------------------------------------------------------------------------- /tests/helpers/llama_helper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/helpers/llama_helper.py -------------------------------------------------------------------------------- /tests/helpers/qwen_helper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/helpers/qwen_helper.py -------------------------------------------------------------------------------- /tests/helpers/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/helpers/utils.py -------------------------------------------------------------------------------- /tests/kernels/run_layer_norm_convergence.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/kernels/run_layer_norm_convergence.py -------------------------------------------------------------------------------- /tests/kernels/test_layer_norm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/kernels/test_layer_norm.py -------------------------------------------------------------------------------- /tests/nanoset/test_build_nanoset_dataloader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/nanoset/test_build_nanoset_dataloader.py -------------------------------------------------------------------------------- /tests/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts=-n 35 3 | markers = 4 | fa2: FA2-related 5 | -------------------------------------------------------------------------------- /tests/test_base_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/test_base_model.py -------------------------------------------------------------------------------- /tests/test_checkpointing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/test_checkpointing.py -------------------------------------------------------------------------------- /tests/test_clip_grads.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/test_clip_grads.py -------------------------------------------------------------------------------- /tests/test_data_parallel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/test_data_parallel.py -------------------------------------------------------------------------------- /tests/test_distributed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/test_distributed.py -------------------------------------------------------------------------------- /tests/test_modeling.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/test_modeling.py -------------------------------------------------------------------------------- /tests/test_moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/test_moe.py -------------------------------------------------------------------------------- /tests/test_optimizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/test_optimizer.py -------------------------------------------------------------------------------- /tests/test_optimizer_params_groups.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/test_optimizer_params_groups.py -------------------------------------------------------------------------------- /tests/test_p2p.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/test_p2p.py -------------------------------------------------------------------------------- /tests/test_parameter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/test_parameter.py -------------------------------------------------------------------------------- /tests/test_parameters_accumulate_gradient_in_fp32.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/test_parameters_accumulate_gradient_in_fp32.py -------------------------------------------------------------------------------- /tests/test_parametrization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/test_parametrization.py -------------------------------------------------------------------------------- /tests/test_pipeline_parallel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/test_pipeline_parallel.py -------------------------------------------------------------------------------- /tests/test_random_state.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/test_random_state.py -------------------------------------------------------------------------------- /tests/test_serialize.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/test_serialize.py -------------------------------------------------------------------------------- /tests/test_sft.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/test_sft.py -------------------------------------------------------------------------------- /tests/test_tensor_parallel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/test_tensor_parallel.py -------------------------------------------------------------------------------- /tests/test_tie_weights.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/test_tie_weights.py -------------------------------------------------------------------------------- /tests/test_zero.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tests/test_zero.py -------------------------------------------------------------------------------- /tools/preprocess_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/nanotron/HEAD/tools/preprocess_data.py --------------------------------------------------------------------------------