├── .github ├── deployment-configs │ ├── deploy-deberta.yaml │ ├── deploy-flan-t5.yaml │ ├── deploy-gpt-j-pod-4.yaml │ ├── deploy-huggingface-paperspace.yaml │ ├── deploy-licenses.yaml │ ├── deploy-magma.yaml │ ├── deploy-optimum-7.1.yaml │ └── deploy-whisper.yaml ├── pull_request_template.md ├── test_configs │ └── image-config.yaml └── workflows │ ├── 1-static-checks.yml │ ├── 2-local-vpod-tests.yml │ ├── 3-probe-on-ps.yml │ ├── configs │ ├── pre-commit-config.yaml │ ├── pylint.rc │ └── ruff.toml │ ├── copy-notebooks-from-source.yml │ └── sync-repos.yml ├── .gradient ├── available_ipus.py ├── check_tier.py ├── notebook-tests.yaml ├── prepare-datasets.sh ├── settings.yaml └── symlink_config.json ├── LICENSE ├── README.md ├── README_first.ipynb ├── audio-processing ├── requirements.txt ├── wav2vec2-fine-tuning-checkpoint.ipynb └── wav2vec2-inference-checkpoint.ipynb ├── dolly2-instruction-following ├── Dolly2-an-OSS-instruction-LLM.ipynb ├── OpenAssistant-Pythia-12B-Chatbot.ipynb ├── api │ ├── __init__.py │ └── pipeline.py ├── config │ ├── __init__.py │ ├── config.py │ └── inference.yml ├── inference.py ├── modelling │ ├── __init__.py │ ├── attention.py │ ├── decoder.py │ ├── dolly_lm.py │ ├── dolly_model.py │ ├── embedding.py │ ├── feed_forward.py │ ├── hf_mapping.py │ └── rotary_pos_embed │ │ ├── __init__.py │ │ ├── common.hpp │ │ ├── rotary_pos_embed.cpp │ │ ├── rotary_pos_embed.hpp │ │ ├── rotary_pos_embed.py │ │ ├── rotary_pos_embed_binding.cpp │ │ ├── rotary_pos_embedx.cpp │ │ └── rotary_pos_embedx.hpp ├── requirements.txt ├── tests │ ├── conftest.py │ ├── integration │ │ ├── execution │ │ │ └── test_execution.py │ │ └── layers │ │ │ ├── test_attention_TP.py │ │ │ ├── test_decoder_block_TP.py │ │ │ ├── test_feed_forward_TP.py │ │ │ ├── test_lm_TP.py │ │ │ └── test_model_TP.py │ └── test_config.yml └── utils │ ├── __init__.py │ ├── setup.py │ └── simple_parsing_tools.py ├── gptj-text-generation ├── GPTJ-generative-inference.ipynb ├── GPTJ-group-quantized.ipynb ├── api.py ├── config │ ├── __init__.py │ ├── config.py │ ├── finetuning.yml │ └── inference.yml ├── data │ ├── __init__.py │ ├── data_utils.py │ ├── hf_data_utils.py │ └── mnli_data.py ├── finetuning.ipynb ├── finetuning.py ├── imgs │ ├── bs_buffers.png │ ├── data_parallelism.png │ ├── dp_tp.png │ ├── execution.jpg │ ├── gq-speed-accuracy-tradeoff.png │ ├── mnli_dataset.png │ ├── rts.png │ ├── tensor_parallelism.png │ ├── tp.jpg │ └── tp_dp_rts.png ├── inference.py ├── modelling │ ├── __init__.py │ ├── attention.py │ ├── decoder.py │ ├── embedding.py │ ├── feed_forward.py │ ├── gptj_lm.py │ ├── gptj_model.py │ └── hf_mapping.py ├── pytest.ini ├── requirements-dev.txt ├── requirements.txt ├── run_finetuning.py ├── run_inference.py ├── run_validation.py ├── tests │ ├── conftest.py │ ├── integration │ │ ├── execution │ │ │ ├── test_execution.py │ │ │ └── test_overfitting.py │ │ └── layers │ │ │ ├── test_attention_TP.py │ │ │ ├── test_decoder_block_TP.py │ │ │ ├── test_feed_forward_TP.py │ │ │ ├── test_gptj_TP.py │ │ │ └── test_lm_TP.py │ ├── test_config.yml │ └── unit │ │ └── test_dataloder.py ├── tests_serial │ ├── dataloader_checkpoints.py │ ├── distributed_sampler.py │ └── test_distributed_data.py └── utils │ ├── __init__.py │ ├── inference.py │ ├── pipeline.py │ ├── setup.py │ ├── simple_parsing_tools.py │ ├── trainer.py │ └── utils.py ├── image-classification ├── LICENSE └── image_classification.ipynb ├── images ├── folder_logo.png ├── go_emotions.png └── jupyter_logo.png ├── llama2-chatbot ├── .gitignore ├── LICENSE ├── api │ 
├── __init__.py │ └── pipeline.py ├── config │ ├── __init__.py │ ├── config.py │ └── inference.yml ├── inference.py ├── llama2-inference.ipynb ├── modelling │ ├── __init__.py │ ├── attention.py │ ├── decoder.py │ ├── embedding.py │ ├── feed_forward.py │ ├── hf_mapping.py │ ├── llama_lm.py │ ├── llama_model.py │ ├── rms_norm.py │ └── rotary_pos_embed │ │ ├── .rendered.rotary_pos_embed_binding.cpp │ │ ├── __init__.py │ │ ├── common.hpp │ │ ├── rotary_pos_embed.cpp │ │ ├── rotary_pos_embed.hpp │ │ ├── rotary_pos_embed.py │ │ ├── rotary_pos_embed_binding.cpp │ │ ├── rotary_pos_embed_binding.cpython-38-x86_64-linux-gnu.so │ │ ├── rotary_pos_embedx.cpp │ │ └── rotary_pos_embedx.hpp ├── pytest.ini ├── requirements.txt ├── run-inference.py ├── tests │ ├── conftest.py │ ├── integration │ │ ├── execution │ │ │ └── test_execution.py │ │ └── layers │ │ │ ├── test_attention_TP.py │ │ │ ├── test_decoder_block_TP.py │ │ │ ├── test_feed_forward_TP.py │ │ │ ├── test_lm_TP.py │ │ │ └── test_model_TP.py │ └── test_config.yml └── utils │ ├── __init__.py │ ├── setup.py │ └── simple_parsing_tools.py ├── molfeat ├── requirements.txt ├── transformers_molfeat_finetune.ipynb └── utils.py ├── multimodal └── magma │ ├── Image-description-using-MAGMA.ipynb │ ├── configs │ ├── MAGMA_v1.yml │ ├── __init__.py │ ├── config.py │ └── inference.yml │ ├── demo_example_images │ ├── cantaloupe_popsicle.jpg │ ├── circles.jpg │ ├── circles_square.jpg │ ├── korea.jpg │ ├── matterhorn.jpg │ ├── mushroom.jpg │ ├── people.jpg │ ├── playarea.jpg │ ├── popsicle.png │ ├── rainbow_popsicle.jpeg │ └── table_tennis.jpg │ ├── images │ ├── MagmaStructure.png │ └── demo_magma.png │ ├── inference.py │ ├── modelling │ ├── __init__.py │ ├── adapters_TP.py │ ├── clip_resnet │ │ ├── __init__.py │ │ ├── attention_pool.py │ │ ├── batch_norm.py │ │ ├── bottleneck.py │ │ ├── modified_resnet.py │ │ └── stem.py │ ├── gptj │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── decoder.py │ │ ├── embedding.py │ │ ├── feed_forward.py │ │ ├── finetuneanon_mapping.py │ │ ├── gptj_lm.py │ │ └── gptj_model.py │ ├── image_prefix.py │ └── magma_mapping.py │ ├── requirements.txt │ ├── run_inference.py │ └── utils │ ├── __init__.py │ ├── sampling.py │ ├── setup.py │ └── simple_parsing_tools.py ├── natural-language-processing ├── Flan-T5-generative-inference.ipynb ├── LICENSE ├── doing-more-with-flan-t5 │ ├── Flan-T5-generative-inference.ipynb │ ├── Flan-T5-textual-entailment-fine-tuning.ipynb │ ├── api.py │ ├── config │ │ ├── __init__.py │ │ ├── config.py │ │ ├── finetuning.yml │ │ └── inference.yml │ ├── data │ │ ├── __init__.py │ │ ├── data_utils.py │ │ └── mnli_data.py │ ├── finetuning.py │ ├── graphs │ │ ├── __init__.py │ │ ├── embedding.py │ │ ├── encoder_decoder.py │ │ ├── graphs.py │ │ └── head.py │ ├── imgs │ │ └── mnli_dataset.png │ ├── inference.py │ ├── modelling │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── embedding.py │ │ ├── encoder_decoder.py │ │ ├── feed_forward.py │ │ ├── hf_mapping.py │ │ ├── layer_norm.py │ │ ├── t5_lm.py │ │ └── t5_model.py │ ├── requirements-dev.txt │ ├── requirements.txt │ ├── run_finetuning.py │ ├── run_validation.py │ └── utils │ │ ├── __init__.py │ │ ├── inference.py │ │ ├── pipeline.py │ │ ├── setup.py │ │ ├── simple_parsing_tools.py │ │ ├── trainer.py │ │ └── utils.py ├── images │ ├── bert-pipelining.png │ ├── bert.png │ ├── causal_language_modeling.png │ ├── masked_language_modeling.png │ ├── name_entity_extraction.png │ ├── partitioning.jpg │ ├── pipelining.png │ ├── question_answering.png │ ├── recomputation.png │ ├── 
rts.png │ ├── squad.png │ ├── summarization.png │ ├── t5_vs_flan_t5.png │ ├── text_classification.png │ ├── token_classification.png │ └── translation.png ├── introduction_to_optimum_graphcore.ipynb ├── name-entity-extraction.ipynb ├── other-use-cases │ ├── deberta-blog-notebook.ipynb │ ├── external_model.ipynb │ ├── images │ │ ├── bert-pipelining.png │ │ ├── bert.png │ │ ├── causal_language_modeling.png │ │ ├── masked_language_modeling.png │ │ ├── mt5_oom.png │ │ ├── name_entity_extraction.png │ │ ├── partitioning.jpg │ │ ├── pipelining.png │ │ ├── question_answering.png │ │ ├── recomputation.png │ │ ├── restart_kernel.png │ │ ├── rts.png │ │ ├── squad.png │ │ ├── summarization.png │ │ ├── text_classification.png │ │ ├── token_classification.png │ │ └── translation.png │ ├── language_modelling_from_scratch.ipynb │ ├── mt5_translation.ipynb │ ├── mt5_xnli.ipynb │ ├── multiple_choice.ipynb │ ├── question_answering.ipynb │ ├── summarization.ipynb │ ├── text_classification.ipynb │ └── token_classification.ipynb ├── sentiment_analysis.ipynb ├── squad_preprocessing.py ├── text-embeddings-models │ ├── config.py │ └── text-embeddings-on-ipu.ipynb ├── text_summarization_BART_L_inference.ipynb └── translation.ipynb ├── packed-bert ├── LICENSE ├── __init__.py ├── models │ ├── __init__.py │ └── modeling_bert_packed.py ├── packedBERT_multi_label_text_classification.ipynb ├── packedBERT_question_answering.ipynb ├── packedBERT_single_label_text_classification.ipynb ├── pipeline │ ├── __init__.py │ └── packed_bert.py └── utils │ ├── __init__.py │ └── packing │ ├── __init__.py │ ├── algorithms.py │ ├── dataset_creator.py │ ├── dataset_templates.py │ └── qa_utils.py ├── setup.sh ├── stable-diffusion ├── LICENSE ├── image_to_image.ipynb ├── inpainting.ipynb ├── requirements.txt ├── sample_images │ ├── image_to_image.png │ ├── inpainting.png │ ├── text_to_image.png │ └── text_to_image_sd2.png ├── text_to_image.ipynb └── text_to_image_sd2.ipynb ├── useful-tips ├── images │ ├── connect-tunnel-from-web-1.png │ ├── connect-tunnel-from-web-2.png │ ├── connect-tunnel-to-app-1.png │ ├── connect-tunnel-to-app-2.png │ ├── login-code.png │ ├── login-success.png │ ├── restart_kernel.png │ └── tunnel-unregister.png ├── managing_ipu_resources.ipynb └── using_vscode_in_paperspace.ipynb └── whisper ├── LICENSE ├── whisper-example.ipynb ├── whisper-quantized-example.ipynb └── whisper_finetuning.ipynb /.github/deployment-configs/deploy-deberta.yaml: -------------------------------------------------------------------------------- 1 | _optimum_graphcore_repository: &_optimum_graphcore_repository 2 | origin: https://github.com/huggingface/optimum-graphcore.git 3 | ref: main 4 | 5 | _current_repo_in_github_actions: &_current_repo_in_github_actions 6 | origin: notebooks/ 7 | ref: null 8 | 9 | deberta-lukem: 10 | source: 11 | paths: 12 | - expression: '*' 13 | path: notebooks/deberta-blog-notebook.ipynb 14 | recursive: true 15 | repository: 16 | origin: https://github.com/huggingface/optimum-graphcore.git 17 | prefix: notebooks/ 18 | ref: main 19 | target: 20 | renames: {} 21 | repository: 22 | <<: *_current_repo_in_github_actions 23 | prefix: natural-language-processing/other-use-cases/ 24 | -------------------------------------------------------------------------------- /.github/deployment-configs/deploy-flan-t5.yaml: -------------------------------------------------------------------------------- 1 | _examples_internal_repository: &_examples_internal_repository 2 | origin: examples-internal/ 3 | ref: null 4 | 5 | 6 | 
_common_target_repository: &_common_target_repository 7 | origin: notebooks/ 8 | ref: null 9 | 10 | 11 | flan-t5: 12 | source: 13 | paths: 14 | - expression: '*' 15 | path: nlp/t5/popxl 16 | recursive: true 17 | excludes: 18 | - path: nlp/t5/popxl/pytest.ini 19 | - path: nlp/t5/popxl/.ci 20 | - path: nlp/t5/popxl/README.md 21 | - path: nlp/t5/popxl/.gitignore 22 | - path: nlp/t5/popxl/tests 23 | repository: 24 | <<: *_examples_internal_repository 25 | prefix: nlp/t5/popxl 26 | target: 27 | renames: {} 28 | repository: 29 | <<: *_common_target_repository 30 | prefix: natural-language-processing/doing-more-with-flan-t5/ 31 | -------------------------------------------------------------------------------- /.github/deployment-configs/deploy-gpt-j-pod-4.yaml: -------------------------------------------------------------------------------- 1 | 2 | _optimum_graphcore_repository: &_optimum_graphcore_repository 3 | origin: https://github.com/huggingface/optimum-graphcore.git 4 | ref: main 5 | 6 | _examples_internal_repository: &_examples_internal_repository 7 | origin: examples-internal/ 8 | ref: null 9 | 10 | 11 | _common_target_repository: &_common_target_repository 12 | origin: notebooks/ 13 | ref: null 14 | 15 | gptj: 16 | source: 17 | paths: 18 | - path: nlp/gpt_j/popxl/GPTJ-generative-inference.ipynb 19 | - path: nlp/gpt_j/popxl/config/inference.yml 20 | excludes: 21 | - path: nlp/gpt_j/popxl/README.md 22 | - path: nlp/gpt_j/popxl/.gitignore 23 | - path: nlp/gpt_j/popxl/.ci 24 | repository: 25 | <<: *_examples_internal_repository 26 | prefix: nlp/gpt_j/popxl 27 | target: 28 | renames: {} 29 | repository: 30 | <<: *_common_target_repository 31 | prefix: gptj-text-generation 32 | -------------------------------------------------------------------------------- /.github/deployment-configs/deploy-licenses.yaml: -------------------------------------------------------------------------------- 1 | 2 | _optimum_graphcore_repository: &_optimum_graphcore_repository 3 | origin: https://github.com/huggingface/optimum-graphcore.git 4 | ref: main 5 | 6 | _examples_internal_repository: &_examples_internal_repository 7 | origin: examples-internal/ 8 | ref: null 9 | 10 | 11 | _common_target_repository: &_common_target_repository 12 | origin: notebooks/ 13 | ref: null 14 | 15 | _copy_apache_license: &_copy_apache_license 16 | paths: 17 | - expression: '*' 18 | path: LICENSE 19 | recursive: true 20 | repository: 21 | <<: *_optimum_graphcore_repository 22 | prefix: '' 23 | 24 | license-audio-classification: 25 | source: 26 | <<: *_copy_apache_license 27 | target: 28 | renames: {} 29 | repository: 30 | <<: *_common_target_repository 31 | prefix: 'audio-classification' 32 | 33 | license-audio-classification: 34 | source: 35 | <<: *_copy_apache_license 36 | target: 37 | renames: {} 38 | repository: 39 | <<: *_common_target_repository 40 | prefix: 'audio-classification' 41 | 42 | 43 | license-audio-classification: 44 | source: 45 | <<: *_copy_apache_license 46 | target: 47 | renames: {} 48 | repository: 49 | <<: *_common_target_repository 50 | prefix: 'audio-classification' 51 | 52 | license-natural-language-processing: 53 | source: 54 | <<: *_copy_apache_license 55 | target: 56 | renames: {} 57 | repository: 58 | <<: *_common_target_repository 59 | prefix: 'natural-language-processing' 60 | 61 | license-image-classification: 62 | source: 63 | <<: *_copy_apache_license 64 | target: 65 | renames: {} 66 | repository: 67 | <<: *_common_target_repository 68 | prefix: 'image-classification' 69 | 70 | license-packed-bert: 
71 | source: 72 | <<: *_copy_apache_license 73 | target: 74 | renames: {} 75 | repository: 76 | <<: *_common_target_repository 77 | prefix: 'packed-bert' 78 | 79 | license-stable-diffusion: 80 | source: 81 | <<: *_copy_apache_license 82 | target: 83 | renames: {} 84 | repository: 85 | <<: *_common_target_repository 86 | prefix: 'stable-diffusion' 87 | 88 | license-whisper: 89 | source: 90 | <<: *_copy_apache_license 91 | target: 92 | renames: {} 93 | repository: 94 | <<: *_common_target_repository 95 | prefix: 'whisper' 96 | -------------------------------------------------------------------------------- /.github/deployment-configs/deploy-magma.yaml: -------------------------------------------------------------------------------- 1 | _examples_internal_repository: &_examples_internal_repository 2 | origin: examples-internal/ 3 | ref: null 4 | 5 | 6 | _common_target_repository: &_common_target_repository 7 | origin: notebooks/ 8 | ref: null 9 | 10 | 11 | flan-t5: 12 | source: 13 | paths: 14 | - expression: '*' 15 | path: multimodal/magma/popxl 16 | recursive: true 17 | excludes: 18 | - path: multimodal/magma/popxl/pytest.ini 19 | - path: multimodal/magma/popxl/.ci 20 | - path: multimodal/magma/popxl/README.md 21 | - path: multimodal/magma/popxl/.gitignore 22 | - path: multimodal/magma/popxl/tests 23 | repository: 24 | <<: *_examples_internal_repository 25 | prefix: multimodal/magma/popxl/ 26 | target: 27 | renames: {} 28 | repository: 29 | <<: *_common_target_repository 30 | prefix: multimodal/magma 31 | -------------------------------------------------------------------------------- /.github/deployment-configs/deploy-optimum-7.1.yaml: -------------------------------------------------------------------------------- 1 | 2 | _optimum_graphcore_repository: &_optimum_graphcore_repository 3 | origin: https://github.com/huggingface/optimum-graphcore.git 4 | ref: main 5 | 6 | _examples_internal_repository: &_examples_internal_repository 7 | origin: examples-internal/ 8 | ref: null 9 | 10 | 11 | _common_target_repository: &_common_target_repository 12 | origin: notebooks/ 13 | ref: null 14 | 15 | whisper_quantized: 16 | source: 17 | paths: 18 | - expression: '*' 19 | path: notebooks/whisper-quantized-example.ipynb 20 | recursive: true 21 | repository: 22 | <<: *_optimum_graphcore_repository 23 | prefix: notebooks/ 24 | target: 25 | renames: {} 26 | repository: 27 | <<: *_common_target_repository 28 | prefix: whisper/ 29 | 30 | natural-language-processing-main: 31 | source: 32 | paths: 33 | - path: notebooks/text_summarization.ipynb 34 | repository: 35 | <<: *_optimum_graphcore_repository 36 | prefix: notebooks/ 37 | target: 38 | renames: {natural-language-processing/text_summarization.ipynb: natural-language-processing/text_summarization_BART_L_inference.ipynb} 39 | repository: 40 | <<: *_common_target_repository 41 | prefix: natural-language-processing/ 42 | 43 | text_embeddings_models: 44 | source: 45 | paths: 46 | - expression: '*' 47 | path: notebooks/text_embeddings_models/config.py 48 | path: notebooks/text_embeddings_models/text-embeddings-on-ipu.ipynb 49 | recursive: true 50 | repository: 51 | <<: *_optimum_graphcore_repository 52 | prefix: notebooks/text_embeddings_models/ 53 | target: 54 | repository: 55 | <<: *_common_target_repository 56 | prefix: natural-language-processing/text-embeddings-models/ 57 | -------------------------------------------------------------------------------- /.github/deployment-configs/deploy-whisper.yaml: 
-------------------------------------------------------------------------------- 1 | 2 | _optimum_graphcore_repository: &_optimum_graphcore_repository 3 | origin: https://github.com/huggingface/optimum-graphcore.git 4 | ref: main 5 | 6 | _examples_internal_repository: &_examples_internal_repository 7 | origin: examples-internal/ 8 | ref: null 9 | 10 | 11 | _common_target_repository: &_common_target_repository 12 | origin: notebooks/ 13 | ref: null 14 | 15 | whisper: 16 | source: 17 | paths: 18 | - expression: '*' 19 | path: notebooks/whisper-example.ipynb 20 | recursive: true 21 | repository: 22 | <<: *_optimum_graphcore_repository 23 | prefix: notebooks/ 24 | target: 25 | renames: {} 26 | repository: 27 | <<: *_common_target_repository 28 | prefix: early-access/whisper/ 29 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## Adding a new notebook checklist 2 | 3 | [Contributing a notebook on confluence](https://graphcore.atlassian.net/wiki/spaces/PM/pages/3242393645/Contributing+a+notebook) contains full instructions, if you have questions please ask on #internal-paperspace-graphcore by tagging @aie-paperspace, here is the checklist: 4 | 5 | - [ ] Your notebook should exist and have been landed in another repository (examples or optimum-graphcore) - (this can be skipped in rare instances) 6 | - [ ] Make it configurable by environment variables [see notebook technical guidelines](https://graphcore.atlassian.net/wiki/spaces/PM/pages/3098345498/Writing+a+Paperspace+notebook#Reading-configuration-in-notebooks) 7 | - [ ] Make sure it has a compliant title [See notebook content guidelines](https://graphcore.atlassian.net/wiki/spaces/PM/pages/3094381247/Notebooks+guidelines) 8 | - [ ] Add an entry to `.github/deployment-configs/` to copy it over - this will create a PR with files automatically copied over for you. You will need to merge it into your branch (simply click the merge button on the automated PR, it will do the right thing). The config format is defined in [graphcore/paperspace-automation - deployment](https://github.com/graphcore/paperspace-automation/tree/main/deployment) 9 | - [ ] remove READMEs (they do not render on Paperspace) 10 | - [ ] make sure appropriate licence is included (MIT: no action needed, other licenses need to be added to folder) 11 | - [ ] Once the file structure matches what you want, merge the PR that was automatically created, ask for feedback from #internal-paperspace-graphcore if you are not sure about the file structure to adopt 12 | - [ ] Generate a short link [confluence instructions](https://graphcore.atlassian.net/wiki/spaces/PM/pages/3219194169/Generating+a+short+URL+for+a+Paperspace+notebook) and add a ROG button on the notebook 13 | - [ ] Make minimal Paperspace specific changes 14 | - [ ] remove relative links in Markdown text (unsupported on Paperspace). Either use full URLs to github, or print the relative path as code e.g. "... 
the notebook at `../tutorial3/walkthrough.ipynb`" 15 | - [ ] unpin matplotlib, pandas and numpy requirements 16 | - [ ] Make sure the graphcore-cloud-tools logger is added 17 | - [ ] Add an entry to test the notebook in `.gradient/notebooks-tests.yaml` 18 | - [ ] Add the notebook to the `README_first.ipynb` 19 | - [ ] Dataset, checkpoint, poplar cache upload ([dataset management - confluence](https://graphcore.atlassian.net/wiki/spaces/PM/pages/3226206448/Paperspace+dataset+management)) 20 | - [ ] Upload any required datasets, checkpoints and caches to `/a/scratch/ai-public-datasets` 21 | - [ ] Symlink any new datasets by editing `.gradient/symlink_config.json`, symlinks are from the read only `PUBLIC_DATASETS_DIR` to the appropriate read/write equivalent `DATASETS_DIR`, `CHECKPOINT_DIR`, `HF_DATASETS_CACHE`, etc... (see `setup.sh` for possibilities) 22 | - [ ] if you need new environment variables defined, make changes to `setup.sh` 23 | - [ ] Download files generated during the CI run which will be cached from AWS ([download - AWS data - confluence](https://graphcore.atlassian.net/wiki/spaces/PM/pages/3226206448/Paperspace+dataset+management#Accessing-artefacts-generated-in-Github-actions)) 24 | - [ ] Upload datasets, checkpoints and other caches to gradient datasets 25 | - [ ] If you have created a new dataset, add corresponding entry to `.gradient/settings.yaml` 26 | - [ ] Test on Paperspace: you can trigger a test on Paperspace by using the "workflow dispatch" trigger in Github Actions and changing "Local" to "Paperspace" (you can also do this manually) 27 | 28 | Once all this is done, or steps have been agreed to be unnecessary, merge this PR 🙂 29 | 30 | Don't forget to tell #internal-paperspace-graphcore that the PR has landed 31 | -------------------------------------------------------------------------------- /.github/test_configs/image-config.yaml: -------------------------------------------------------------------------------- 1 | # This config file allows you to specify whether notebooks need to be run with different configs in CI. 2 | # This is to allow us to test notebooks with different parameters - such as changing docker images 3 | # - `default` is related to the default CI testing infrastructure, with the currently released SDK, fill this if you would like to test on both the default and a specific config 4 | # - `early-access-tests` use an early release docker container and will run tests for these notebooks separately. 5 | 6 | default: 7 | test_names: [] 8 | 9 | 10 | -------------------------------------------------------------------------------- /.github/workflows/2-local-vpod-tests.yml: -------------------------------------------------------------------------------- 1 | name: 2. 
Tests on Local vPOD 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | test_env: 7 | type: choice 8 | description: "Testing environment" 9 | required: false 10 | default: 'Local' 11 | options: 12 | - Local 13 | - Paperspace 14 | docker_image: 15 | type: string 16 | description: "Docker image used in notebook testing" 17 | required: false 18 | default: "graphcore/pytorch-paperspace:3.3.0-ubuntu-20.04-20230703" 19 | notebooks: 20 | type: string 21 | description: "List of notebooks to test in JSON format" 22 | required: false 23 | default: '["Graphcore-HuggingFace-README_first"]' 24 | machine_types: 25 | type: string 26 | description: "List of machines types" 27 | required: false 28 | default: '["IPU-POD4"]' 29 | test_mode: 30 | type: string 31 | description: "The test workload that we are running, default or config set in the .github/test_configs/image-config.yaml" 32 | required: false 33 | test_config: 34 | type: string 35 | description: "Config which can be used to define special parameters such as docker image." 36 | default: ".github/test_configs/image-config.yaml" 37 | required: false 38 | local_cache_type: 39 | type: choice 40 | description: "Use PURE filesystem mount or s3 caches, s3 cache takes 5mn." 41 | required: false 42 | options: 43 | - mount 44 | - s3 45 | 46 | pull_request: 47 | branches-ignore: 48 | - 'gh-action-branches/**' 49 | schedule: 50 | # run at 7:00 PM GMT every night 51 | - cron: '0 19 * * TUE,FRI' 52 | 53 | 54 | jobs: 55 | tests: 56 | uses: graphcore/paperspace-automation/.github/workflows/subwf-vpod-tests-for-nb-repo.yml@main 57 | with: 58 | docker_image: ${{ inputs.docker_image || 'graphcore/pytorch-paperspace:3.3.0-ubuntu-20.04-20230703' }} 59 | notebooks: ${{ inputs.notebooks }} 60 | machine_types: ${{ inputs.machine_types }} 61 | test_env: ${{ inputs.test_env || 'Local' }} 62 | test_mode: "default" 63 | test_config: ${{ inputs.test_config || '.github/test_configs/image-config.yaml' }} 64 | # Use mounts on PRs as they are faster and s3 in nightlies as they are more representative 65 | local_cache_type: ${{ inputs.local_cache_type || (github.event_name == 'pull_request' && 'mount') || 's3' }} 66 | secrets: 67 | gh_user: ${{ secrets.GH_TOKEN_USER }} 68 | gh_token: ${{ secrets.GH_TOKEN_SYNC_REPOS }} 69 | hugging_face_hub_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} 70 | slack_bot_token: ${{ secrets.SLACK_BOT_TOKEN }} 71 | ci_slack_channel_id: ${{ secrets.CI_SLACK_CHANNEL_ID }} 72 | aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} 73 | aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 74 | dataset_s3_download_b64_credential: ${{ secrets.DATASET_S3_DOWNLOAD_B64_CREDENTIAL }} 75 | paperspace_api_key: ${{ secrets.PAPERSPACE_API_KEY }} 76 | gradient_ui_email: ${{ secrets.GRADIENT_UI_EMAIL }} 77 | gradient_ui_password: ${{ secrets.GRADIENT_UI_PASSWORD }} 78 | gradient_validation_key: ${{ secrets.GRADIENT_VALIDATION_KEY }} 79 | 80 | -------------------------------------------------------------------------------- /.github/workflows/3-probe-on-ps.yml: -------------------------------------------------------------------------------- 1 | name: 3. 
Probe on PS env 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | docker_image: 7 | type: string 8 | description: "Docker image used in notebook testing" 9 | required: false 10 | default: "graphcore/pytorch-paperspace:3.3.0-ubuntu-20.04-20230703" 11 | pull_request: 12 | schedule: 13 | # run every 6h and at 9am 14 | - cron: '05 0,6,9,12,18 * * *' 15 | 16 | 17 | jobs: 18 | probe-in-ps: 19 | name: Probe in PS env 20 | uses: graphcore/paperspace-automation/.github/workflows/subwf-probe-in-ps.yml@main 21 | with: 22 | docker_image: ${{ inputs.docker_image || 'graphcore/pytorch-paperspace:3.3.0-ubuntu-20.04-20230703' }} 23 | secrets: 24 | gh_user: ${{ secrets.GH_TOKEN_USER }} 25 | gh_token: ${{ secrets.GH_TOKEN_SYNC_REPOS }} 26 | paperspace_api_key: ${{ secrets.PAPERSPACE_API_KEY }} 27 | gradient_ui_email: ${{ secrets.GRADIENT_UI_EMAIL }} 28 | gradient_ui_password: ${{ secrets.GRADIENT_UI_PASSWORD }} 29 | gradient_validation_key: ${{ secrets.GRADIENT_VALIDATION_KEY }} 30 | slack_bot_token: ${{ secrets.SLACK_BOT_TOKEN }} 31 | ci_slack_channel_id: ${{ secrets.CI_SLACK_CHANNEL_ID }} 32 | -------------------------------------------------------------------------------- /.github/workflows/configs/pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # NOTE: The versions can be updated by calling 2 | # pre-commit autoupdate 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v4.4.0 6 | hooks: 7 | - id: check-json 8 | - id: check-yaml 9 | - id: check-merge-conflict 10 | - id: debug-statements 11 | - id: check-added-large-files 12 | - id: end-of-file-fixer 13 | - id: mixed-line-ending 14 | args: ["--fix=lf"] 15 | - id: trailing-whitespace 16 | - id: detect-private-key 17 | 18 | - repo: https://github.com/psf/black 19 | rev: 22.12.0 20 | hooks: 21 | - id: black 22 | args: [--line-length, "120", --skip-string-normalization] 23 | - id: black-jupyter 24 | files: '.*\.ipynb' 25 | - repo: https://github.com/codespell-project/codespell 26 | rev: v2.2.4 27 | hooks: 28 | - id: codespell 29 | -------------------------------------------------------------------------------- /.github/workflows/configs/ruff.toml: -------------------------------------------------------------------------------- 1 | # Enable flake8-bugbear (`B`) rules. 2 | #select = ["E", "F", "B"] 3 | 4 | # Never enforce `E501` (line length violations). 5 | ignore = ["E501"] 6 | 7 | # Avoid trying to fix flake8-bugbear (`B`) violations. 8 | #unfixable = ["B"] 9 | 10 | # Ignore `E402` (import violations) in all `__init__.py` files, and in `path/to/file.py`. 
11 | #[per-file-ignores] 12 | #"__init__.py" = ["E402"] 13 | #"path/to/file.py" = ["E402"] 14 | -------------------------------------------------------------------------------- /.github/workflows/copy-notebooks-from-source.yml: -------------------------------------------------------------------------------- 1 | name: Copy notebooks from source repos 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | deployment_spec: 7 | required: false 8 | type: string 9 | description: "Spec file to be read for copying notebooks, must be valid input to deployment/deploy.py" 10 | default: .github/deployment-configs/deploy-deberta.yaml 11 | pull_request: 12 | 13 | 14 | jobs: 15 | tests: 16 | uses: graphcore/paperspace-automation/.github/workflows/copy-to-nb-repo.yml@main 17 | with: 18 | deployment_spec: ${{ inputs.deployment_spec }} 19 | secrets: 20 | gh_token: ${{ secrets.GH_TOKEN_SYNC_REPOS }} 21 | -------------------------------------------------------------------------------- /.github/workflows/sync-repos.yml: -------------------------------------------------------------------------------- 1 | name: Sync to public repo 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - main 8 | schedule: 9 | # run at 10:00 PM GMT every night 10 | - cron: '0 22 * * *' 11 | 12 | jobs: 13 | 14 | sync-repos: 15 | name: Sync to public repo 16 | runs-on: 'ubuntu-latest' 17 | steps: 18 | - uses: actions/checkout@v3 19 | with: 20 | fetch-depth: 0 21 | token: ${{ secrets.GH_TOKEN_SYNC_REPOS }} 22 | - name: Sync repos 23 | env: 24 | DEST_REPO_URL: https://${{ secrets.GH_TOKEN_USER }}:${{ secrets.GH_TOKEN_SYNC_REPOS }}@github.com/graphcore/Gradient-HuggingFace 25 | run: | 26 | # checkout all remote branches 27 | git checkout main 28 | for BRANCH in $(git branch -a | grep remotes | grep -v HEAD | grep -v main); do 29 | git branch --force --track "${BRANCH#remotes/origin/}" "${BRANCH}" 30 | done 31 | 32 | # remove pull refs that are for Pull Requests, GitHub does not accept them 33 | git for-each-ref --format 'delete %(refname)' refs/pull | git update-ref --stdin 34 | 35 | git remote add target "${DEST_REPO_URL}" 36 | 37 | git push --mirror target 38 | -------------------------------------------------------------------------------- /.gradient/available_ipus.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | import subprocess 3 | import json 4 | import warnings 5 | import os 6 | 7 | try: 8 | j = subprocess.check_output(['gc-monitor', '-j'], timeout=10) 9 | data = json.loads(j) 10 | num_ipuMs = len(data["cards"]) 11 | num_ipus = 4 * num_ipuMs 12 | except subprocess.TimeoutExpired as err: 13 | num_ipus = 0 14 | print(num_ipus) 15 | nb_id = os.getenv("PAPERSPACE_METRIC_WORKLOAD_ID", "unknown") 16 | raise OSError( 17 | "Connection to IPUs timed-out. This error indicates a problem with the " 18 | "hardware you are running on. Please contact Paperspace Support referencing" 19 | f" the Notebook ID: {nb_id}" 20 | ) from err 21 | # to be captured as a variable in the bash script that calls this python script 22 | print(num_ipus) -------------------------------------------------------------------------------- /.gradient/check_tier.py: -------------------------------------------------------------------------------- 1 | 2 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
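# Prints "FREE" or "PAID" so the calling setup script can tell whether this host is a free-tier Paperspace machine.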
3 | 4 | import os 5 | 6 | hostname = os.getenv("HOSTNAME", "unknown") 7 | 8 | # Free tier hosts 9 | free_hostnames = [f"lr17-1-poplar-{i}" for i in range(1, 36)] 10 | free_hostnames.append("lr17-1-poplar-63") 11 | free_hostnames.append("lr17-1-poplar-64") 12 | 13 | if hostname in free_hostnames: 14 | print("FREE") 15 | else: 16 | print("PAID") 17 | -------------------------------------------------------------------------------- /.gradient/prepare-datasets.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | set -uxo pipefail 3 | run-tests() { 4 | # we do not exit on errors to make sure Paperspace notebooks get terminated 5 | set +e 6 | echo "PAPERSPACE-AUTOMATED-TESTING: Started testing" 7 | if [ "${8}" == "unset" ]; then 8 | EXAMPLES_UTILS_REV=latest_stable 9 | else 10 | EXAMPLES_UTILS_REV=${8} 11 | fi 12 | python -m pip install gradient 13 | python -m pip install "examples-utils[jupyter] @ git+https://github.com/graphcore/examples-utils@${EXAMPLES_UTILS_REV}" 14 | 15 | # set variable matching the standard Paperspace entry point 16 | export PIP_DISABLE_PIP_VERSION_CHECK=1 17 | export HUGGING_FACE_HUB_TOKEN=${7} 18 | export VIRTUAL_ENV="/some/fake/venv/GC-automated-paperspace-test-${4}" 19 | 20 | LOG_FOLDER="${5}/log_${4}_$(date +'%Y-%m-%d-%H_%M_%S')" 21 | mkdir -p ${LOG_FOLDER} 22 | TEST_CONFIG_FILE="${6}" 23 | # Run the health check script 24 | HEALTH_CHECK_LOG_FOLDER="/storage/graphcore_health_checks" 25 | python -m graphcore_cloud_tools.paperspace_utils.health_check --log-folder ${HEALTH_CHECK_LOG_FOLDER} 26 | # Copy the health check logs to local log folder 27 | HEALTH_CHECK_LOG_FILE=$(find ${HEALTH_CHECK_LOG_FOLDER} -type f | sort -n | tail -1) 28 | cp ${HEALTH_CHECK_LOG_FILE} ${LOG_FOLDER} 29 | 30 | cd /notebooks/ 31 | echo "PAPERSPACE-AUTOMATED-TESTING: starting platform_assessment testing" 32 | python -m examples_utils platform_assessment --spec ${TEST_CONFIG_FILE} "${@:9}" \ 33 | --log-dir $LOG_FOLDER \ 34 | --gc-monitor \ 35 | --cloning-directory /tmp/clones \ 36 | --additional-metrics 37 | 38 | exit_code=$? 39 | tar -czvf "${LOG_FOLDER}.tar.gz" ${LOG_FOLDER} 40 | echo "PAPERSPACE-AUTOMATED-TESTING: Testing complete with exit code ${exit_code}" 41 | echo "Shutting down notebook" 42 | 43 | if [ "${PAPERSPACE_METRIC_WORKLOAD_ID:-}" ] 44 | then 45 | sleep 5 46 | gradient apiKey ${1} 47 | gradient notebooks stop --id ${PAPERSPACE_METRIC_WORKLOAD_ID} 48 | fi 49 | echo "Notebook Stopped" 50 | } 51 | 52 | if [ ! 
"$(command -v fuse-overlayfs)" ]; then 53 | echo "fuse-overlayfs not found installing - please update to our latest image" 54 | apt update -y 55 | apt install -o DPkg::Lock::Timeout=120 -y psmisc libfuse3-dev fuse-overlayfs 56 | fi 57 | 58 | python -m pip install "graphcore-cloud-tools[logger] @ git+https://github.com/graphcore/graphcore-cloud-tools@v0.3" 59 | 60 | echo "Starting preparation of datasets" 61 | SCRIPT_DIR="$( dirname -- "${BASH_SOURCE[0]}" )" 62 | # Support passive credential cycling despite pinned dependency 63 | if [ -z "${DATASET_S3_DOWNLOAD_B64_CREDENTIAL:-}" ]; then 64 | curl https://raw.githubusercontent.com/graphcore/graphcore-cloud-tools/main/graphcore_cloud_tools/paperspace_utils/auth.py > tmp_auth.py 65 | DATASET_S3_DOWNLOAD_B64_CREDENTIAL=$(python3 tmp_auth.py) 66 | rm tmp_auth.py 67 | fi 68 | 69 | python -m graphcore_cloud_tools paperspace symlinks --s3-dataset \ 70 | --config-file ${SCRIPT_DIR}/symlink_config.json \ 71 | --gradient-settings-file ${SCRIPT_DIR}/settings.yaml --num-concurrent-downloads 20 --max-concurrency 1 72 | echo "Finished running prepare-datasets.sh" 73 | # Run automated test if specified 74 | if [[ "${1:-}" == 'test' ]]; then 75 | ARGS="${@:2}" 76 | elif [[ "${2:-}" == 'test' ]]; then 77 | ARGS="${@:3}" 78 | fi 79 | [ -n "${ARGS+x}" ] && run-tests $ARGS 80 | 81 | echo "Finished running setup.sh." 82 | -------------------------------------------------------------------------------- /.gradient/settings.yaml: -------------------------------------------------------------------------------- 1 | integrations: 2 | gcl: 3 | type: dataset 4 | ref: paperspace/ds7me5hgjbfht6q:8ngwr2a 5 | magma: 6 | type: dataset 7 | ref: graphcore-managed-s3 8 | poplar-executables-hf-3-3: 9 | type: dataset 10 | ref: paperspace/ds367opyfl97110:be9cyhp 11 | librispeech_asr: 12 | type: dataset 13 | ref: paperspace/ds1uofih1koi71b:xi0qac2 14 | # removing superb while the audio classification nb is removed 15 | # superb: 16 | # type: dataset 17 | # ref: paperspace/dsgrgvk6f7zecuw:fov2xl1 18 | # removing downloads while the audio classification nb is removed 19 | # graphcore-downloads: 20 | # type: dataset 21 | # ref: paperspace/ds52xkj0j1elf02:5nesnjo 22 | dfki-sentinel-eurosat: 23 | type: dataset 24 | ref: paperspace/ds8p6sv96fl1att:k5j4cob 25 | -------------------------------------------------------------------------------- /.gradient/symlink_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "${POPLAR_EXECUTABLE_CACHE_DIR}":["${S3_DATASETS_DIR}/poplar-executables-hf-3-3/${SDK_VERSION}"], 3 | "${HF_DATASETS_CACHE}/librispeech_asr":["${S3_DATASETS_DIR}/librispeech_asr"], 4 | "${DATASETS_DIR}/dfki-sentinel-eurosat":["${S3_DATASETS_DIR}/dfki-sentinel-eurosat"], 5 | "${DATASETS_DIR}/magma":["${S3_DATASETS_DIR}/magma"] 6 | } 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
4 | 5 | Copyright (c) 2022 Gradient° 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Training 🤗 Models on IPUs using Paperspace Gradient 2 | 3 | Whether you are looking to generate images with Stable Diffusion, derive insights from text, or need to recognize audio samples, the examples in here have you covered. 4 | 5 | 6 | #### Join our Slack community to interact with other developers! 7 | 8 | [![Join our Slack Community](https://img.shields.io/badge/Slack-Join%20Graphcore's%20Community-blue?style=flat-square&logo=slack)](https://www.graphcore.ai/join-community) 9 | 10 | 11 | ## License 12 | 13 | Unless otherwise specified by a LICENSE file in a subdirectory, the LICENSE referenced at the top level applies to the files in this repository. 14 | 15 | “Jupyter” and the Jupyter logos are trademarks or registered trademarks of NumFOCUS, used by Graphcore with permission. -------------------------------------------------------------------------------- /audio-processing/requirements.txt: -------------------------------------------------------------------------------- 1 | optimum-graphcore==0.7 2 | --find-links https://download.pytorch.org/whl/torch_stable.html 3 | torchaudio == 2.0.2+cpu 4 | librosa 5 | numpy>=1.22 6 | jiwer 7 | soundfile 8 | graphcore-cloud-tools[logger] @ git+https://github.com/graphcore/graphcore-cloud-tools@v0.3 9 | -------------------------------------------------------------------------------- /dolly2-instruction-following/api/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | from .pipeline import DollyPipeline 4 | -------------------------------------------------------------------------------- /dolly2-instruction-following/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
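# Re-exports the Dolly configuration classes and exposes CONFIG_DIR, the directory holding the bundled YAML configs.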
2 | 3 | from .config import DollyConfig, Execution 4 | import os 5 | from pathlib import Path 6 | 7 | CONFIG_DIR = Path(os.path.dirname(__file__)) 8 | 9 | del os, Path 10 | -------------------------------------------------------------------------------- /dolly2-instruction-following/config/inference.yml: -------------------------------------------------------------------------------- 1 | # -------- Models -------- 2 | tiny: &tiny 3 | model: 4 | layers: 2 5 | hidden_size: 100 6 | sequence_length: 64 7 | attention: 8 | heads: 4 9 | rotary_dim: 4 10 | embedding: 11 | vocab_size: 150 12 | 13 | dolly: &dolly 14 | model: 15 | layers: 36 16 | hidden_size: 5120 17 | sequence_length: 2048 18 | attention: 19 | heads: 40 20 | rotary_positional_embeddings_base: 10000 21 | rotary_dim: 32 # should be rotary_pct of head dim 22 | embedding: 23 | vocab_size: 50280 24 | # ------------------------- 25 | 26 | # ------- Execution ------- 27 | release: 28 | tiny: 29 | <<: *tiny 30 | execution: 31 | micro_batch_size: 4 32 | available_memory_proportion: [ 0.4 ] 33 | tensor_parallel: 4 34 | 35 | dolly_pod4: 36 | <<: *dolly 37 | execution: 38 | micro_batch_size: 1 39 | available_memory_proportion: [ 0.4 ] 40 | tensor_parallel: 4 41 | 42 | dolly_pod16: 43 | <<: *dolly 44 | execution: 45 | micro_batch_size: 4 46 | available_memory_proportion: [ 0.4 ] 47 | tensor_parallel: 16 48 | attention_tensor_parallel: 8 49 | -------------------------------------------------------------------------------- /dolly2-instruction-following/modelling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | -------------------------------------------------------------------------------- /dolly2-instruction-following/modelling/decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
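# Tensor-parallel Dolly decoder: a single decoder block runs attention and feed-forward as parallel branches (GPT-NeoX style) around two layer norms, and the decoder outlines that block once and binds it per layer.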
2 | import numpy as np 3 | from typing import Dict 4 | 5 | import popxl 6 | from popxl import ops 7 | from popxl.utils import to_numpy 8 | 9 | import popxl_addons as addons 10 | from popxl_addons import NamedTensors 11 | from popxl_addons.layers import LayerNorm 12 | 13 | from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXLayer as HFModel 14 | 15 | from config import DollyConfig 16 | from .attention import DollySelfAttentionTP 17 | from .feed_forward import DollyFeedForwardTP 18 | 19 | 20 | class DollyDecoderBlockTP(addons.Module): 21 | def __init__(self, config: DollyConfig): 22 | super().__init__() 23 | self.config = config 24 | # begins with identical computations: layer norm ln_1 25 | self.ln_1 = LayerNorm() 26 | self.ln_2 = LayerNorm() 27 | # attention is sharded 28 | # identical computation for bias and skip connection 29 | self.attention = DollySelfAttentionTP(self.config) 30 | # begins with identical computations: layer norm ln_2 31 | # feed forward is sharded 32 | # identical computation for bias, dropout and skip connection 33 | self.feed_forward = DollyFeedForwardTP(self.config) 34 | 35 | def build(self, x: popxl.Tensor): 36 | residual = x 37 | attn_out = self.attention(self.ln_1(x)) 38 | 39 | ff_out = self.feed_forward(self.ln_2(x)) 40 | x = attn_out + ff_out + residual 41 | return x 42 | 43 | @staticmethod 44 | def hf_mapping(config: DollyConfig, variables: NamedTensors, hf_model: HFModel) -> Dict[popxl.Tensor, np.ndarray]: 45 | dtype = config.model.dtype 46 | weights = { 47 | variables.ln_1.weight: to_numpy(hf_model.input_layernorm.weight.data, dtype), 48 | variables.ln_1.bias: to_numpy(hf_model.input_layernorm.bias.data, dtype), 49 | variables.ln_2.weight: to_numpy(hf_model.post_attention_layernorm.weight.data, dtype), 50 | variables.ln_2.bias: to_numpy(hf_model.post_attention_layernorm.bias.data, dtype), 51 | } 52 | weights.update(DollySelfAttentionTP.hf_mapping(config, variables.attention, hf_model.attention)) 53 | weights.update(DollyFeedForwardTP.hf_mapping(config, variables.feed_forward, hf_model.mlp)) 54 | 55 | return weights 56 | 57 | 58 | class DollyDecoderTP(addons.Module): 59 | def __init__(self, config: DollyConfig): 60 | super().__init__() 61 | self.config = config 62 | 63 | def build(self, x: popxl.Tensor): 64 | 65 | facts, graph = DollyDecoderBlockTP(self.config).create_graph(x) # Outline GPT Layer 66 | 67 | for i in range(self.config.model.layers): 68 | args_nt = self.add_variable_inputs(i, facts) 69 | (x,) = graph.bind(args_nt).call(x) 70 | 71 | return x 72 | -------------------------------------------------------------------------------- /dolly2-instruction-following/modelling/dolly_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
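# Full tensor-parallel Dolly transformer body: embeddings, decoder stack and optional final layer norm, with the mapping from Hugging Face GPT-NeoX weights.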
2 | import numpy as np 3 | from typing import Dict 4 | from config import DollyConfig 5 | 6 | import popxl 7 | from popxl.utils import to_numpy 8 | 9 | import popxl_addons as addons 10 | from popxl_addons import NamedTensors 11 | 12 | from popxl_addons.layers import LayerNorm 13 | 14 | from .embedding import DollyEmbeddingsTP 15 | from .decoder import DollyDecoderTP, DollyDecoderBlockTP 16 | 17 | from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXModel as HFModel 18 | 19 | 20 | class DollyModelTP(addons.Module): 21 | def __init__(self, config: DollyConfig, include_layer_norm=True): 22 | super().__init__() 23 | self.config = config 24 | # sharded, then last bit identical 25 | self.embeddings = DollyEmbeddingsTP(self.config) 26 | # identical inputs, then sharded, then identical 27 | self.decoder = DollyDecoderTP(self.config) 28 | # identical 29 | self.include_layer_norm = include_layer_norm 30 | if self.include_layer_norm: 31 | self.ln_f = LayerNorm() 32 | 33 | def build(self, input_ids: popxl.Tensor): 34 | x = self.embeddings(input_ids) 35 | x = self.decoder(x) 36 | if self.include_layer_norm: 37 | x = self.ln_f(x) 38 | return x 39 | 40 | @staticmethod 41 | def hf_mapping( 42 | config: DollyConfig, variables: NamedTensors, hf_model: HFModel, layer_norm=True 43 | ) -> Dict[popxl.Tensor, np.ndarray]: 44 | dtype = config.model.dtype 45 | weights = {} 46 | if layer_norm: 47 | weights = { 48 | variables.ln_f.weight: to_numpy(hf_model.final_layer_norm.weight.data, dtype), 49 | variables.ln_f.bias: to_numpy(hf_model.final_layer_norm.bias.data, dtype), 50 | } 51 | 52 | weights.update(DollyEmbeddingsTP.hf_mapping(config, variables.embeddings, hf_model)) 53 | 54 | for l in range(config.model.layers): 55 | weights.update(DollyDecoderBlockTP.hf_mapping(config, variables.decoder[l], hf_model.layers[l])) 56 | 57 | return weights 58 | -------------------------------------------------------------------------------- /dolly2-instruction-following/modelling/feed_forward.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
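# Tensor-parallel MLP: the intermediate projection is sharded column-wise (gelu is non-linear), the output projection row-wise, followed by an all-reduce and a bias applied identically on every shard.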
2 | from typing import Optional, List, Dict 3 | import popxl 4 | from popxl import ops 5 | from popxl.utils import to_numpy 6 | from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXMLP as HFModel 7 | 8 | import popxl_addons as addons 9 | from config import DollyConfig 10 | from popxl_addons.layers import Linear 11 | import numpy as np 12 | 13 | from popxl_addons.named_tensors import NamedTensors 14 | from popxl_addons.ops.replicated_all_reduce_TP import replicated_all_reduce 15 | from popxl_addons.array_munging import shard 16 | 17 | 18 | class DollyFeedForwardTP(addons.Module): 19 | def __init__(self, config: DollyConfig, ff_size: Optional[int] = None): 20 | super().__init__() 21 | self.config = config 22 | tp = config.execution.tensor_parallel 23 | dp = config.execution.data_parallel 24 | self.n_shards = tp 25 | self.replica_grouping = popxl.gcg().ir.replica_grouping(stride=tp, group_size=dp) 26 | # Also known as the intermediate size 27 | self.ff_size = 4 * config.model.hidden_size if ff_size is None else ff_size 28 | assert self.ff_size % self.n_shards == 0 29 | # ----- Layers ----- 30 | # Sharded across devices - column wise 31 | self.intermediate = Linear(self.ff_size // self.n_shards, replica_grouping=self.replica_grouping) 32 | 33 | # Sharded across devices - row wise (bias applied separately) 34 | self.output = Linear(config.model.hidden_size, bias=False, replica_grouping=self.replica_grouping) 35 | 36 | def build(self, x: popxl.Tensor) -> List[popxl.Tensor]: 37 | """Identical input (x, seed) and identical output across shards.""" 38 | # ----- Sharded computation ----- 39 | 40 | # Shard column-wise since gelu is not linear. 41 | # Indeed, sharding row wise requires a sum AllReduce at the end, 42 | # but gelu is not linear: gelu(x+y) != gelu(x) + gelu(y) 43 | z = self.intermediate(x) 44 | z = ops.gelu(z) 45 | # Here, x is already sharded across devices. Since we don't have non linearities, 46 | # we can shard row-wise (which requires both X and the weight matrix to be sharded) 47 | # and then perform an all reduce 48 | z = self.output(z) 49 | 50 | z = replicated_all_reduce(z, group=self.replica_grouping.transpose()) 51 | 52 | # ----- Identical computation ----- 53 | 54 | # Output linear layer bias (identical bias on all devices) 55 | self.bias = self.add_variable_input("bias", lambda: np.zeros(z.shape[-1]), z.dtype) 56 | z = z + self.bias 57 | 58 | return z 59 | 60 | @staticmethod 61 | def hf_mapping(config: DollyConfig, variables: NamedTensors, hf_model: HFModel) -> Dict[popxl.Tensor, np.ndarray]: 62 | dtype = config.model.dtype 63 | n_shards = config.execution.tensor_parallel 64 | 65 | return { 66 | # HF DollyMLP 67 | variables.intermediate.weight: shard( 68 | to_numpy(hf_model.dense_h_to_4h.weight.data.T, dtype), n_shards, axis=-1 69 | ), 70 | variables.intermediate.bias: shard(to_numpy(hf_model.dense_h_to_4h.bias.data, dtype), n_shards, axis=-1), 71 | variables.output.weight: shard(to_numpy(hf_model.dense_4h_to_h.weight.data.T, dtype), n_shards, axis=0), 72 | variables.bias: to_numpy(hf_model.dense_4h_to_h.bias.data, dtype), 73 | } 74 | -------------------------------------------------------------------------------- /dolly2-instruction-following/modelling/hf_mapping.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
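# Helpers for copying weights between Hugging Face GPT-NeoX models and the popxl TaskSession state, in both directions.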
2 | 3 | from typing import Dict 4 | import numpy as np 5 | 6 | from transformers.models.gpt_neox import GPTNeoXModel as HFModel 7 | from transformers.models.gpt_neox import GPTNeoXForCausalLM as HFLMHeadModel 8 | 9 | import popxl 10 | from popxl_addons import TaskSession 11 | 12 | from config import DollyConfig 13 | from modelling.dolly_model import DollyModelTP 14 | from modelling.dolly_lm import DollyLMHeadModelTP 15 | 16 | 17 | def hf_mapping_lm_tp( 18 | config: DollyConfig, session: TaskSession, pretrained: HFLMHeadModel 19 | ) -> Dict[popxl.Tensor, np.ndarray]: 20 | load_to = session.state 21 | if "fwd" in session.state: 22 | load_to = session.state.fwd 23 | weights = DollyLMHeadModelTP.hf_mapping(config, load_to, pretrained) 24 | return weights 25 | 26 | 27 | def hf_mapping_TP(config: DollyConfig, session: TaskSession, pretrained: HFModel) -> Dict[popxl.Tensor, np.ndarray]: 28 | load_to = session.state 29 | if "fwd" in session.state: 30 | load_to = session.state.fwd 31 | weights = DollyModelTP.hf_mapping(config, load_to, pretrained) 32 | return weights 33 | 34 | 35 | def load_lm_to_hf(session: TaskSession, hf_model: HFLMHeadModel) -> HFLMHeadModel: 36 | weights = session.get_named_tensors_data() 37 | if "fwd" in weights: 38 | weights = weights.fwd 39 | state_dict = DollyLMHeadModelTP.to_hf(weights, hf_model) 40 | # check only missing keys are mask-related keys 41 | hf_state_keys = hf_model.state_dict().keys() 42 | popxl_keys = state_dict.keys() 43 | 44 | def should_check(k: str): 45 | return "attn.bias" not in k and "attn.masked_bias" not in k 46 | 47 | for k in hf_state_keys: 48 | if should_check(k) and k not in popxl_keys: 49 | raise KeyError(f"key {k} not found in session state") 50 | 51 | hf_model.load_state_dict(state_dict, strict=False) 52 | return hf_model 53 | 54 | 55 | def load_to_hf(session: TaskSession, hf_model: HFModel) -> HFModel: 56 | weights = session.get_named_tensors_data() 57 | if "fwd" in weights: 58 | weights = weights.fwd 59 | 60 | state_dict = DollyModelTP.to_hf(weights, hf_model) 61 | # check only missing keys are mask-related keys 62 | hf_state_keys = hf_model.state_dict().keys() 63 | popxl_keys = state_dict.keys() 64 | 65 | def should_check(k: str): 66 | return "attn.bias" not in k and "attn.masked_bias" not in k 67 | 68 | for k in hf_state_keys: 69 | if should_check(k) and k not in popxl_keys: 70 | raise KeyError(f"key {k} not found in session state") 71 | 72 | hf_model.load_state_dict(state_dict, strict=False) 73 | return hf_model 74 | -------------------------------------------------------------------------------- /dolly2-instruction-following/modelling/rotary_pos_embed/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | from .rotary_pos_embed import * 3 | -------------------------------------------------------------------------------- /dolly2-instruction-following/modelling/rotary_pos_embed/common.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
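// Shared PopART operator identifiers for the custom rotary positional embedding op and its gradient.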
2 | #ifndef GUARD_ROTARYPOSEMBED_OPIDS 3 | #define GUARD_ROTARYPOSEMBED_OPIDS 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using InMapType = std::map; 11 | using OutMapType = std::map; 12 | using OutIndex = int; 13 | 14 | namespace popart { 15 | 16 | #define CUSTOM_OP_DOMAIN "popxl.addons.ops" 17 | 18 | const popart::OperatorIdentifier RotaryPosEmbed = OperatorIdentifier{ 19 | CUSTOM_OP_DOMAIN, 20 | "RotaryPosEmbed", 21 | 1, // Op version 22 | {3, 3}, // number of inputs 23 | 1 // number of outputs 24 | }; 25 | 26 | const popart::OperatorIdentifier RotaryPosEmbedGrad = OperatorIdentifier{ 27 | CUSTOM_OP_DOMAIN, 28 | "RotaryPosEmbedGrad", 29 | 1, // Op version 30 | {3, 3}, // number of inputs 31 | 1 // number of outputs 32 | }; 33 | 34 | } // namespace popart 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /dolly2-instruction-following/modelling/rotary_pos_embed/rotary_pos_embed.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include "common.hpp" 19 | #include "rotary_pos_embed.hpp" 20 | 21 | namespace popart { 22 | 23 | ///////////////////////////////////////////////////////////// 24 | ////// Fwd op 25 | 26 | RotaryPosEmbedOp::RotaryPosEmbedOp(const OperatorIdentifier &_opid, 27 | uint32_t rotary_dim_, 28 | const Op::Settings &settings_) 29 | : Op(_opid, settings_), rotary_dim{rotary_dim_} { 30 | if ((rotary_dim % 2) != 0) { 31 | throw error("RotaryPosEmbedOp::RotaryPosEmbedOp rotary_dim must be a " 32 | "multiple of 2"); 33 | } 34 | } 35 | 36 | std::unique_ptr RotaryPosEmbedOp::clone() const { 37 | return std::make_unique(*this); 38 | } 39 | 40 | std::vector> RotaryPosEmbedOp::getGradOps() { 41 | std::vector> result; 42 | result.push_back(std::make_unique(*this)); 43 | return result; 44 | } 45 | 46 | void RotaryPosEmbedOp::setup() { 47 | auto xInfo = inInfo(0); 48 | auto cosInfo = inInfo(1); 49 | auto sinInfo = inInfo(2); 50 | 51 | // check expected shapes 52 | if (xInfo.rank() != 4) { 53 | throw error( 54 | "RotaryPosEmbedOp::setup x should have rank 4 (batch, heads, seq, hh)"); 55 | } 56 | if (cosInfo.rank() != 3 || sinInfo.rank() != 3) { 57 | throw error("RotaryPosEmbedOp::setup trig functions should have rank 3 " 58 | "(1 or batch, seq, hh/2)"); 59 | } 60 | if ((rotary_dim % 2) != 0) { 61 | throw error("RotaryPosEmbedOp::setup rotary dim must be a multiple of 2"); 62 | } 63 | 64 | // x rotated 65 | outInfo(0) = xInfo; 66 | } 67 | 68 | void RotaryPosEmbedOp::appendOutlineAttributes(OpSerialiserBase &os) const { 69 | os.appendAttribute("rotary_dim", rotary_dim); 70 | Op::appendOutlineAttributes(os); 71 | } 72 | 73 | ///////////////////////////////////////////////////////////// 74 | ////// Grad op 75 | 76 | RotaryPosEmbedGradOp::RotaryPosEmbedGradOp(const RotaryPosEmbedOp &op) 77 | : Op(RotaryPosEmbedGrad, op.getSettings()), rotary_dim{op.rotary_dim} {} 78 | 79 | const std::map &RotaryPosEmbedGradOp::gradOutToNonGradIn() const { 80 | static const std::map outInfo = {{0, 0}}; 81 | return outInfo; 82 | } 83 | 84 | const std::vector & 85 | RotaryPosEmbedGradOp::gradInputInfo() const { 86 | static const std::vector inInfo = { 87 | {0, 0, GradOpInType::GradOut}, 88 | {1, 1, GradOpInType::In}, 89 | {2, 2, 
GradOpInType::In}}; 90 | return inInfo; 91 | } 92 | 93 | void RotaryPosEmbedGradOp::setup() { outInfo(0) = inInfo(0); } 94 | 95 | std::unique_ptr RotaryPosEmbedGradOp::clone() const { 96 | return std::make_unique(*this); 97 | } 98 | 99 | void RotaryPosEmbedGradOp::appendOutlineAttributes(OpSerialiserBase &os) const { 100 | os.appendAttribute("rotary_dim", rotary_dim); 101 | Op::appendOutlineAttributes(os); 102 | } 103 | 104 | } // namespace popart 105 | -------------------------------------------------------------------------------- /dolly2-instruction-following/modelling/rotary_pos_embed/rotary_pos_embed.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | #ifndef GUARD_NEURALNET_STRIDEDSLICE_HPP 3 | #define GUARD_NEURALNET_STRIDEDSLICE_HPP 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "common.hpp" 10 | 11 | namespace popart { 12 | 13 | class RotaryPosEmbedOp : public Op { 14 | public: 15 | RotaryPosEmbedOp(const OperatorIdentifier &_opid, uint32_t rotary_dim_, 16 | const Op::Settings &settings_); 17 | 18 | std::unique_ptr clone() const override; 19 | std::vector> getGradOps() override; 20 | void setup() final; 21 | 22 | float getSubgraphValue() const override { return getHighSubgraphValue(); } 23 | 24 | static RotaryPosEmbedOp * 25 | createOpInGraph(popart::Graph &graph, const InMapType &in, 26 | const OutMapType &out, uint32_t rotary_dim_, 27 | const popart::Op::Settings &settings) { 28 | return graph.createConnectedOp(in, out, RotaryPosEmbed, 29 | rotary_dim_, settings); 30 | } 31 | 32 | void appendOutlineAttributes(OpSerialiserBase &) const override; 33 | 34 | uint32_t rotary_dim = 0; 35 | }; 36 | 37 | class RotaryPosEmbedGradOp : public Op { 38 | public: 39 | RotaryPosEmbedGradOp(const RotaryPosEmbedOp &op); 40 | 41 | void setup() final; 42 | std::unique_ptr clone() const override; 43 | const std::vector &gradInputInfo() const final; 44 | const std::map &gradOutToNonGradIn() const final; 45 | 46 | float getSubgraphValue() const override { return getHighSubgraphValue(); } 47 | 48 | void appendOutlineAttributes(OpSerialiserBase &) const override; 49 | 50 | uint32_t rotary_dim = 0; 51 | }; 52 | 53 | } // namespace popart 54 | 55 | #endif 56 | -------------------------------------------------------------------------------- /dolly2-instruction-following/modelling/rotary_pos_embed/rotary_pos_embed_binding.cpp: -------------------------------------------------------------------------------- 1 | // cppimport 2 | // NOTE: the cppimport comment is necessary for dynamic compilation when loading 3 | // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #include "common.hpp" 32 | #include "rotary_pos_embed.hpp" 33 | #include "rotary_pos_embedx.hpp" 34 | 35 | namespace py = pybind11; 36 | 37 | // -------------- PyBind -------------- 38 | // `rotary_pos_embed_binding` must equal filename 39 | PYBIND11_MODULE(rotary_pos_embed_binding, m) { 40 | // Bindings the parameters of the op: constructor + fields. 
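// Assumed usage sketch (Python side, not part of this file): rotary_pos_embed.py is
// expected to compile and load this module dynamically via cppimport, e.g.
//   import cppimport
//   binding = cppimport.imp("rotary_pos_embed_binding")
//   op = binding.RotaryPosEmbedOp.createOpInGraph(graph, inputs, outputs, rotary_dim, settings)
// cppimport picks up the configuration block at the bottom of this file.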
41 | py::class_> 43 | binding(m, "RotaryPosEmbedOp"); 44 | binding.def_static( 45 | "createOpInGraph", 46 | py::overload_cast( 48 | &popart::RotaryPosEmbedOp::createOpInGraph), 49 | py::arg("graph"), py::arg("inputs"), py::arg("outputs"), 50 | py::arg("rotaryDim"), py::arg("settings"), 51 | py::return_value_policy::reference); 52 | binding.def("outTensor", 53 | py::overload_cast(&popart::RotaryPosEmbedOp::outTensor), 54 | py::return_value_policy::reference); 55 | }; 56 | 57 | // -------------- cppimport -------------- 58 | // cppimport configuration for compiling the pybind11 module. 59 | // clang-format off 60 | /* 61 | <% 62 | cfg['sources'] = ['rotary_pos_embed.cpp', 'rotary_pos_embedx.cpp'] 63 | cfg['extra_compile_args'] = ['-std=c++14', '-fPIC', '-O2', '-DONNX_NAMESPACE=onnx', '-Wall', '-Wno-sign-compare'] 64 | cfg['libraries'] = ['popart', 'poputil', 'popops', 'poplin', 'popnn', 'poprand', 'gcl'] 65 | setup_pybind11(cfg) 66 | %> 67 | */ 68 | -------------------------------------------------------------------------------- /dolly2-instruction-following/modelling/rotary_pos_embed/rotary_pos_embedx.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | #ifndef GUARD_NEURALNET_ROTARYPOSEMBEDX_HPP 3 | #define GUARD_NEURALNET_ROTARYPOSEMBEDX_HPP 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | namespace popart { 10 | namespace popx { 11 | 12 | class RotaryPosEmbedOpx : public Opx { 13 | public: 14 | RotaryPosEmbedOpx(Op *, Devicex *); 15 | 16 | void grow(poplar::program::Sequence &) const; 17 | }; 18 | 19 | class RotaryPosEmbedGradOpx : public Opx { 20 | public: 21 | RotaryPosEmbedGradOpx(Op *, Devicex *); 22 | 23 | void grow(poplar::program::Sequence &) const; 24 | }; 25 | 26 | } // namespace popx 27 | } // namespace popart 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /dolly2-instruction-following/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy==1.10.1 3 | 4 | 5 | git+https://github.com/graphcore/popxl-addons.git@sdk-release-3.3.0 6 | transformers 7 | tokenizers==0.13.3 8 | accelerate==0.20.3 9 | 10 | pytest==6.2.5 11 | pytest-pythonpath==0.7.4 12 | 13 | graphcore-cloud-tools[logger] @ git+https://github.com/graphcore/graphcore-cloud-tools@v0.3 14 | examples-utils[common] @ git+https://github.com/graphcore/examples-utils@v3.2 15 | 16 | -f https://download.pytorch.org/whl/torch_stable.html 17 | torch==2.0.1+cpu 18 | -------------------------------------------------------------------------------- /dolly2-instruction-following/tests/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | import os 3 | from config import DollyConfig 4 | 5 | import pytest 6 | 7 | from config import DollyConfig 8 | from utils.simple_parsing_tools import parse_args_with_config_file 9 | 10 | 11 | def _test_config_file(): 12 | return os.path.join(os.path.dirname(__file__), "test_config.yml") 13 | 14 | 15 | @pytest.fixture 16 | def test_config_file(): 17 | return _test_config_file() 18 | 19 | 20 | @pytest.fixture 21 | def test_config(): 22 | return parse_args_with_config_file(DollyConfig, ["--config", _test_config_file()]) 23 | 24 | 25 | # Below functions enable long tests to be skipped, unless a --long-test 26 | # cli option is specified. 
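# Hedged sketch of a typical consumer of this flag (not defined in this file): a
# pytest_collection_modifyitems hook could skip marked tests when the option is
# absent, e.g.
#   def pytest_collection_modifyitems(config, items):
#       if not config.getoption("--long-tests"):
#           skip = pytest.mark.skip(reason="pass --long-tests to run")
#           for item in items:
#               if "long_test" in item.keywords:
#                   item.add_marker(skip)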
27 | def pytest_addoption(parser): 28 | parser.addoption("--long-tests", action="store_true", default=False, help="Run long tests") 29 | -------------------------------------------------------------------------------- /dolly2-instruction-following/tests/integration/execution/test_execution.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | import os 3 | import sys 4 | from pathlib import Path 5 | 6 | from examples_tests.test_util import SubProcessChecker 7 | 8 | root_dir = Path(__file__).parent.parent.parent.parent.resolve() 9 | 10 | 11 | def dolly_root_env_path(): 12 | env = os.environ 13 | env["PYTHONPATH"] = ":".join((*sys.path, str(root_dir))) 14 | return env 15 | 16 | 17 | class TestExecution(SubProcessChecker): 18 | def test_inference(self): 19 | self.run_command( 20 | "python3 inference.py --config tiny --layers 2 " 21 | "--tensor_parallel 4 " 22 | "--vocab_size 128 --sequence_length 16 " 23 | "--hidden_size 128 --heads 8", 24 | root_dir, 25 | ["Duration"], 26 | env=dolly_root_env_path(), 27 | ) 28 | -------------------------------------------------------------------------------- /dolly2-instruction-following/tests/integration/layers/test_attention_TP.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | import numpy as np 3 | import torch 4 | 5 | # HF 6 | from transformers.models.gpt_neox import GPTNeoXConfig as HFConfig 7 | from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXAttention 8 | 9 | import popxl 10 | 11 | import popxl_addons as addons 12 | from popxl_addons.patterns import apply_pre_alias_patterns 13 | from config import DollyConfig 14 | from modelling.attention import DollySelfAttentionTP 15 | from popxl_addons.array_munging import repeat 16 | 17 | 18 | def test_attention_TP_cmp_huggingface(test_config: DollyConfig): 19 | torch.manual_seed(42) 20 | 21 | batch_size = test_config.execution.micro_batch_size 22 | seq_len = test_config.model.sequence_length 23 | hidden_size = test_config.model.hidden_size 24 | intermediate_size = hidden_size * 4 25 | 26 | # HuggingFace 27 | config = HFConfig( 28 | hidden_size=hidden_size, 29 | max_position_embeddings=seq_len, 30 | intermediate_size=intermediate_size, 31 | num_attention_heads=test_config.model.attention.heads, 32 | rotary_dim=test_config.model.attention.rotary_dim, 33 | ) 34 | hf_model = GPTNeoXAttention(config).eval() 35 | 36 | # HF forward 37 | input_t = torch.rand((batch_size, seq_len, hidden_size), requires_grad=True) 38 | output_, *_ = hf_model(input_t, None) 39 | output_HF = output_.detach().numpy() 40 | 41 | # TP 42 | n_shards = test_config.execution.tensor_parallel 43 | 44 | # popxl 45 | ir = popxl.Ir() 46 | ir.replication_factor = n_shards 47 | with ir.main_graph: 48 | inputs_data, inputs_host_steam, inputs_tensors = zip( 49 | *[ 50 | addons.host_load( 51 | input_t.reshape(-1, test_config.model.hidden_size), test_config.model.dtype, name="input" 52 | ), 53 | ] 54 | ) 55 | (x,) = inputs_tensors 56 | 57 | attn_args, attn_graph = DollySelfAttentionTP(test_config).create_graph(x) 58 | 59 | vars = attn_args.init() 60 | fwd_info = attn_graph.bind(vars).call_with_info(x) 61 | (acts,) = fwd_info.outputs 62 | 63 | fwd_d2h = addons.host_store(acts) 64 | 65 | # Run `OpToIdentityPattern` among others part of `PreAliasPatterns` 66 | apply_pre_alias_patterns(ir, level="default") 67 | 68 | weights = 
DollySelfAttentionTP.hf_mapping(test_config, vars, hf_model) 69 | 70 | inputs = {h2d: repeat(data, n_shards).squeeze() for h2d, data in zip(inputs_host_steam, inputs_data)} 71 | 72 | with popxl.Session(ir, "ipu_hw") as session: 73 | session.write_variables_data(weights) 74 | outputs_popxl = session.run(inputs) 75 | 76 | fwd_data = outputs_popxl[fwd_d2h] 77 | 78 | if n_shards > 1: 79 | assert len(fwd_data) == n_shards 80 | 81 | # Assert all IPU outputs are identical 82 | for i in range(1, n_shards): 83 | np.testing.assert_equal(fwd_data[0], fwd_data[i]) 84 | else: 85 | fwd_data = np.expand_dims(fwd_data, axis=0) 86 | 87 | # Assert nearly equal to HF 88 | np.testing.assert_almost_equal(output_HF, fwd_data[0].reshape(output_HF.shape), 4) 89 | -------------------------------------------------------------------------------- /dolly2-instruction-following/tests/integration/layers/test_decoder_block_TP.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | import numpy as np 3 | import torch 4 | 5 | # HF 6 | from transformers.models.gpt_neox.configuration_gpt_neox import GPTNeoXConfig as HFConfig 7 | from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXLayer 8 | 9 | import popxl 10 | from popxl.utils import to_numpy 11 | 12 | import popxl_addons as addons 13 | from popxl_addons.patterns import apply_pre_alias_patterns 14 | 15 | from config import DollyConfig 16 | from modelling.decoder import DollyDecoderBlockTP 17 | from popxl_addons.array_munging import repeat 18 | 19 | 20 | def test_decoder_block_TP_cmp_huggingface(test_config: DollyConfig): 21 | torch.manual_seed(42) 22 | 23 | batch_size = test_config.execution.micro_batch_size 24 | seq_len = test_config.model.sequence_length 25 | hidden_size = test_config.model.hidden_size 26 | intermediate_size = hidden_size * 4 27 | 28 | # HuggingFace 29 | config = HFConfig( 30 | hidden_size=hidden_size, 31 | max_position_embeddings=seq_len, 32 | intermediate_size=intermediate_size, 33 | num_attention_heads=test_config.model.attention.heads, 34 | rotary_dim=test_config.model.attention.rotary_dim, 35 | use_parallel_residual=True, 36 | ) 37 | hf_model = GPTNeoXLayer(config).eval() 38 | 39 | # HF forward 40 | input_t = torch.rand((batch_size, seq_len, hidden_size), requires_grad=True) 41 | (output_,) = hf_model(input_t) 42 | 43 | output_HF = output_.detach().numpy() 44 | 45 | # TP 46 | n_shards = test_config.execution.tensor_parallel 47 | test_config.execution.tensor_parallel = n_shards 48 | 49 | # popxl 50 | ir = popxl.Ir() 51 | ir.replication_factor = n_shards 52 | 53 | replica_grouping = ir.replica_grouping(stride=1, group_size=1) 54 | 55 | main = ir.main_graph 56 | 57 | with main: 58 | inputs_data, inputs_host_steam, inputs_tensors = zip( 59 | *[ 60 | addons.host_load(input_t.reshape(-1, test_config.model.hidden_size), popxl.float32, name="input"), 61 | ] 62 | ) 63 | (x,) = inputs_tensors 64 | 65 | args, graph = DollyDecoderBlockTP(test_config).create_graph(x) 66 | 67 | ff_vars = args.init() 68 | ff = graph.bind(ff_vars) 69 | fwd_info = ff.call_with_info(x) 70 | (acts,) = fwd_info.outputs 71 | 72 | fwd_d2h = addons.host_store(acts) 73 | 74 | # Run `OpToIdentityPattern` among others part of `PreAliasPatterns` 75 | apply_pre_alias_patterns(ir, level="default") 76 | 77 | weights = DollyDecoderBlockTP.hf_mapping(test_config, ff_vars, hf_model) 78 | 79 | inputs = {h2d: repeat(data, n_shards) for h2d, data in zip(inputs_host_steam, inputs_data)} 80 | 81 | 
with popxl.Session(ir, "ipu_hw") as session: 82 | session.write_variables_data(weights) 83 | outputs_popxl = session.run(inputs) 84 | 85 | fwd_data = outputs_popxl[fwd_d2h] 86 | 87 | assert len(fwd_data) == n_shards 88 | 89 | # Assert all IPU outputs are identical 90 | for i in range(1, n_shards): 91 | np.testing.assert_equal(fwd_data[0], fwd_data[i]) 92 | # Assert nearly equal to HF 93 | np.testing.assert_almost_equal(output_HF, fwd_data[0].reshape(output_HF.shape), 3) 94 | -------------------------------------------------------------------------------- /dolly2-instruction-following/tests/integration/layers/test_feed_forward_TP.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | import numpy as np 3 | import torch 4 | 5 | # HF 6 | from transformers.models.gpt_neox.configuration_gpt_neox import GPTNeoXConfig as HFConfig 7 | from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXMLP 8 | 9 | import popxl 10 | 11 | import popxl_addons as addons 12 | from popxl_addons.patterns import apply_pre_alias_patterns 13 | 14 | from config import DollyConfig 15 | from modelling.feed_forward import DollyFeedForwardTP 16 | from popxl_addons.array_munging import repeat 17 | 18 | 19 | def test_feed_forward_TP_cmp_huggingface(test_config: DollyConfig): 20 | torch.manual_seed(42) 21 | 22 | batch_size = test_config.execution.micro_batch_size 23 | seq_len = test_config.model.sequence_length 24 | hidden_size = test_config.model.hidden_size 25 | intermediate_size = hidden_size * 4 26 | 27 | # HuggingFace 28 | config = HFConfig( 29 | hidden_size=hidden_size, 30 | max_position_embeddings=seq_len, 31 | intermediate_size=intermediate_size, 32 | num_attention_heads=test_config.model.attention.heads, 33 | ) 34 | hf_model = GPTNeoXMLP(config).eval() 35 | 36 | # HF forward 37 | input_t = torch.rand((batch_size, seq_len, hidden_size)) 38 | outputs = hf_model(input_t) 39 | output_ = outputs.reshape(batch_size * seq_len, hidden_size) 40 | output_HF = output_.detach().numpy() 41 | 42 | # TP 43 | n_shards = test_config.execution.tensor_parallel 44 | 45 | # popxl 46 | ir = popxl.Ir() 47 | ir.replication_factor = n_shards 48 | 49 | main = ir.main_graph 50 | 51 | with main: 52 | inputs_data, inputs_host_steam, inputs_tensors = zip( 53 | *[ 54 | addons.host_load(input_t.reshape(-1, test_config.model.hidden_size), popxl.float32, name="input"), 55 | ] 56 | ) 57 | (x,) = inputs_tensors 58 | 59 | ff_args, ff_graph = DollyFeedForwardTP(test_config).create_graph(x) 60 | 61 | ff_vars = ff_args.init() 62 | ff = ff_graph.bind(ff_vars) 63 | fwd_info = ff.call_with_info(x) 64 | (acts,) = fwd_info.outputs 65 | 66 | fwd_d2h = addons.host_store(acts) 67 | 68 | # Run `OpToIdentityPattern` among others part of `PreAliasPatterns` 69 | apply_pre_alias_patterns(ir, level="default") 70 | 71 | weights = DollyFeedForwardTP.hf_mapping(test_config, ff_vars, hf_model) 72 | 73 | inputs = {h2d: repeat(data, n_shards) for h2d, data in zip(inputs_host_steam, inputs_data)} 74 | with popxl.Session(ir, "ipu_hw") as session: 75 | session.write_variables_data(weights) 76 | outputs_popxl = session.run(inputs) 77 | 78 | fwd_data = outputs_popxl[fwd_d2h] 79 | 80 | assert len(fwd_data) == n_shards 81 | 82 | # Assert all IPU outputs are identical 83 | for i in range(1, n_shards): 84 | np.testing.assert_equal(fwd_data[0], fwd_data[i]) 85 | # Assert nearly equal to HF 86 | np.testing.assert_almost_equal(output_HF, fwd_data[0].reshape(output_HF.shape), 3) 87 | 
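# Note on the popxl_addons.array_munging helpers used in these layer tests (assumed
# semantics, illustrated with plain NumPy): repeat(x, n) stacks n identical copies of
# the host batch along a new leading replica axis so every tensor-parallel replica
# sees the same input, e.g.
#   x = np.arange(6).reshape(2, 3)
#   np.stack([x] * 4).shape  # (4, 2, 3): one copy per replica
# while shard(x, n, axis) splits a weight into n per-replica pieces along `axis`.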
-------------------------------------------------------------------------------- /dolly2-instruction-following/tests/integration/layers/test_lm_TP.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | import numpy as np 3 | import torch 4 | 5 | # HF 6 | from transformers.models.gpt_neox import GPTNeoXConfig as HFConfig 7 | from transformers.models.gpt_neox import GPTNeoXForCausalLM 8 | 9 | import popxl 10 | from popxl.utils import to_numpy 11 | 12 | import popxl_addons as addons 13 | from popxl_addons.patterns import apply_pre_alias_patterns 14 | 15 | from config import DollyConfig 16 | from modelling.embedding import DollyEmbeddingsTP 17 | from modelling.dolly_lm import DollyLMHeadModelTP 18 | 19 | from popxl_addons.array_munging import shard 20 | 21 | 22 | def test_lm_TP_cmp_huggingface(test_config: DollyConfig): 23 | torch.manual_seed(42) 24 | batch_size = test_config.execution.micro_batch_size 25 | hidden_size = test_config.model.hidden_size 26 | intermediate_size = hidden_size * 4 27 | seq_len = test_config.model.sequence_length 28 | # HuggingFace 29 | config = HFConfig( 30 | num_hidden_layers=test_config.model.layers, 31 | vocab_size=test_config.model.embedding.vocab_size, 32 | hidden_size=hidden_size, 33 | max_position_embeddings=seq_len, 34 | intermediate_size=intermediate_size, 35 | num_attention_heads=test_config.model.attention.heads, 36 | rotary_dim=test_config.model.attention.rotary_dim, 37 | ) 38 | hf_model = GPTNeoXForCausalLM(config).eval() 39 | 40 | # HF forward 41 | input_t = torch.randint(0, test_config.model.embedding.vocab_size, (batch_size, test_config.model.sequence_length)) 42 | output_HF = hf_model(input_ids=input_t)[0] 43 | output_HF = output_HF.detach().numpy() 44 | 45 | # n_shards 46 | n_shards = test_config.execution.tensor_parallel 47 | 48 | # Offset inputs 49 | words_offsetted = DollyEmbeddingsTP.offset_inputs(test_config, to_numpy(input_t)) 50 | # popxl 51 | ir = popxl.Ir() 52 | ir.replication_factor = n_shards 53 | replica_grouping = ir.replica_grouping(stride=1, group_size=1) 54 | main = ir.main_graph 55 | 56 | with main: 57 | inputs_data, inputs_host_steam, inputs_tensors = zip( 58 | *[ 59 | addons.host_load(words_offsetted[0], popxl.int32, name="words"), 60 | ] 61 | ) 62 | (words,) = inputs_tensors 63 | facts, graph = DollyLMHeadModelTP(test_config).create_graph(words) 64 | vars = facts.init() 65 | gpt = graph.bind(vars) 66 | call_info = gpt.call_with_info(words) 67 | act, *_ = call_info.outputs 68 | act_stream = addons.host_store(act) 69 | 70 | apply_pre_alias_patterns(ir, level="default") 71 | 72 | # Map weights from huggingface 73 | weights = DollyLMHeadModelTP.hf_mapping(test_config, vars, hf_model) 74 | 75 | inputs = dict(zip(inputs_host_steam, [words_offsetted])) 76 | 77 | ir.num_host_transfers = test_config.execution.device_iterations 78 | 79 | with popxl.Session(ir, "ipu_hw") as session: 80 | session.write_variables_data(weights) 81 | outs = session.run(inputs) 82 | 83 | # Fwd output 84 | fwd_data = outs[act_stream] 85 | assert len(fwd_data) == n_shards 86 | fwd_data_full = np.concatenate(fwd_data, axis=-1)[:, : test_config.model.embedding.vocab_size] 87 | np.testing.assert_almost_equal(output_HF, fwd_data_full.reshape(output_HF.shape), 3) 88 | -------------------------------------------------------------------------------- /dolly2-instruction-following/tests/integration/layers/test_model_TP.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | import numpy as np 3 | import torch 4 | 5 | # HF 6 | from transformers.models.gpt_neox.configuration_gpt_neox import GPTNeoXConfig as HFConfig 7 | from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXModel 8 | 9 | import popxl 10 | from popxl.utils import to_numpy 11 | 12 | import popxl_addons as addons 13 | from popxl_addons.patterns import apply_pre_alias_patterns 14 | 15 | from config import DollyConfig 16 | from modelling.embedding import DollyEmbeddingsTP 17 | from modelling.dolly_model import DollyModelTP 18 | 19 | 20 | def test_model_TP_cmp_huggingface(test_config: DollyConfig): 21 | torch.manual_seed(42) 22 | 23 | batch_size = test_config.execution.micro_batch_size 24 | hidden_size = test_config.model.hidden_size 25 | seq_len = test_config.model.sequence_length 26 | intermediate_size = hidden_size * 4 27 | # HuggingFace 28 | config = HFConfig( 29 | num_hidden_layers=test_config.model.layers, 30 | vocab_size=test_config.model.embedding.vocab_size, 31 | hidden_size=hidden_size, 32 | max_position_embeddings=seq_len, 33 | intermediate_size=intermediate_size, 34 | num_attention_heads=test_config.model.attention.heads, 35 | rotary_dim=test_config.model.attention.rotary_dim, 36 | ) 37 | hf_model = GPTNeoXModel(config).eval() 38 | 39 | # HF forward 40 | input_t = torch.randint(0, test_config.model.embedding.vocab_size, (batch_size, test_config.model.sequence_length)) 41 | output_HF = hf_model(input_ids=input_t)[0] 42 | output_HF = output_HF.detach().numpy() 43 | 44 | # TP 45 | tp = test_config.execution.tensor_parallel 46 | 47 | # Offset inputs 48 | words_offsetted = DollyEmbeddingsTP.offset_inputs(test_config, to_numpy(input_t)) 49 | 50 | # popxl 51 | ir = popxl.Ir() 52 | ir.replication_factor = tp 53 | replica_grouping = ir.replica_grouping(stride=1, group_size=1) 54 | main = ir.main_graph 55 | 56 | with main: 57 | inputs_data, inputs_host_steam, inputs_tensors = zip( 58 | *[ 59 | addons.host_load(words_offsetted[0], popxl.int32, name="words"), 60 | ] 61 | ) 62 | (words,) = inputs_tensors 63 | facts, graph = DollyModelTP(test_config).create_graph(words) 64 | 65 | vars = facts.init() 66 | gpt = graph.bind(vars) 67 | call_info = gpt.call_with_info(words) 68 | act, *_ = call_info.outputs 69 | act_stream = addons.host_store(act) 70 | 71 | apply_pre_alias_patterns(ir, level="default") 72 | 73 | # Map weights from huggingface 74 | weights = DollyModelTP.hf_mapping(test_config, vars, hf_model) 75 | 76 | inputs = dict(zip(inputs_host_steam, [words_offsetted])) 77 | 78 | ir.num_host_transfers = test_config.execution.device_iterations 79 | 80 | with popxl.Session(ir, "ipu_hw") as session: 81 | session.write_variables_data(weights) 82 | outs = session.run(inputs) 83 | 84 | # Fwd output 85 | fwd_data = outs[act_stream] 86 | 87 | assert len(fwd_data) == tp 88 | for i in range(1, tp): 89 | np.testing.assert_equal(fwd_data[0], fwd_data[i]) 90 | 91 | np.testing.assert_almost_equal(output_HF, fwd_data[0].reshape(output_HF.shape), 3) 92 | -------------------------------------------------------------------------------- /dolly2-instruction-following/tests/test_config.yml: -------------------------------------------------------------------------------- 1 | model: 2 | sequence_length: 256 # 8 3 | embedding: 4 | vocab_size: 128 5 | hidden_size: 128 6 | layers: 2 7 | attention: 8 | heads: 4 9 | rotary_dim: 8 10 | precision: "float32" 11 | execution: 12 | 
micro_batch_size: 1 13 | data_parallel: 1 14 | tensor_parallel: 4 15 | -------------------------------------------------------------------------------- /dolly2-instruction-following/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | from .simple_parsing_tools import * 3 | -------------------------------------------------------------------------------- /dolly2-instruction-following/utils/simple_parsing_tools.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | from examples_utils.parsing.simple_parsing_tools import * 4 | -------------------------------------------------------------------------------- /gptj-text-generation/api.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | from utils.trainer import GPTJTrainer 4 | from utils.pipeline import GPTJPipeline, GPTJEntailmentPipeline 5 | -------------------------------------------------------------------------------- /gptj-text-generation/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | 3 | from .config import GPTJConfig, Execution 4 | import os 5 | from pathlib import Path 6 | 7 | CONFIG_DIR = Path(os.path.dirname(__file__)) 8 | 9 | del os, Path 10 | -------------------------------------------------------------------------------- /gptj-text-generation/config/finetuning.yml: -------------------------------------------------------------------------------- 1 | # -------- Models -------- 2 | "gptj_6B_1024": &gptj_6B_1024 3 | model: 4 | sequence_length: 1024 5 | layers: 28 6 | hidden_size: 4096 7 | dropout_prob: 0.0 8 | attention: 9 | heads: 16 10 | rotary_positional_embeddings_base: 10000 11 | rotary_dim: 64 12 | embedding: 13 | vocab_size: 50400 14 | training: 15 | global_batch_size: 128 16 | steps: 500 17 | optimizer: 18 | optimizer: adamw 19 | learning_rate: 20 | maximum: 5e-06 21 | warmup_proportion: 0.005995 22 | weight_decay: 0.0 23 | 24 | tiny: &tiny 25 | model: 26 | sequence_length: 8 27 | embedding: 28 | vocab_size: 128 29 | hidden_size: 64 30 | layers: 2 31 | attention: 32 | heads: 4 33 | rotary_dim: 16 34 | training: 35 | global_batch_size: 16 36 | steps: 100000 37 | optimizer: 38 | optimizer: adamw 39 | learning_rate: 40 | maximum: 0.00001 41 | warmup_proportion: 0.00625 42 | weight_decay: 0.01 43 | 44 | # ------------------------- 45 | 46 | 47 | # ------- Execution ------- 48 | release: 49 | "gptj_6B_1024_pod64": 50 | <<: *gptj_6B_1024 51 | execution: 52 | micro_batch_size: 1 53 | loss_scaling: 4096 54 | io_tiles: 128 55 | data_parallel: 4 56 | tensor_parallel: 16 57 | available_memory_proportion: [ 0.2 ] 58 | attention_serialisation: 2 59 | 60 | "gptj_6B_1024_pod16": 61 | <<: *gptj_6B_1024 62 | execution: 63 | micro_batch_size: 1 64 | loss_scaling: 4096 65 | io_tiles: 128 66 | data_parallel: 1 67 | tensor_parallel: 16 68 | available_memory_proportion: [ 0.2 ] 69 | attention_serialisation: 2 70 | 71 | tiny: 72 | <<: *tiny 73 | execution: 74 | io_tiles: 64 75 | micro_batch_size: 1 76 | data_parallel: 2 77 | tensor_parallel: 4 78 | attention_serialisation: 2 79 | -------------------------------------------------------------------------------- /gptj-text-generation/config/inference.yml: 
-------------------------------------------------------------------------------- 1 | # -------- Models -------- 2 | tiny: &tiny 3 | model: 4 | eval: true 5 | layers: 2 6 | hidden_size: 64 7 | sequence_length: 8 8 | attention: 9 | heads: 4 10 | rotary_dim: 16 11 | embedding: 12 | vocab_size: 128 13 | 14 | gpt-j: &gpt-j 15 | model: 16 | eval: true 17 | layers: 28 18 | hidden_size: 4096 19 | sequence_length: 1024 20 | attention: 21 | heads: 16 22 | rotary_positional_embeddings_base: 10000 23 | rotary_dim: 64 24 | embedding: 25 | vocab_size: 50400 26 | # ------------------------- 27 | 28 | # ------- Execution ------- 29 | release: 30 | tiny: 31 | <<: *tiny 32 | execution: 33 | micro_batch_size: 1 34 | available_memory_proportion: [ 0.4 ] 35 | tensor_parallel: 4 36 | 37 | gpt-j: 38 | <<: *gpt-j 39 | execution: 40 | micro_batch_size: 12 41 | available_memory_proportion: [ 0.4 ] 42 | tensor_parallel: 4 43 | 44 | gpt-j-gq-4bit: 45 | <<: *gpt-j 46 | execution: 47 | micro_batch_size: 12 48 | available_memory_proportion: [ 0.4 ] 49 | tensor_parallel: 4 50 | group_quantise_weights: 64 51 | 52 | gpt-j-mnli: 53 | <<: *gpt-j 54 | execution: 55 | micro_batch_size: 16 56 | available_memory_proportion: [ 0.4 ] 57 | tensor_parallel: 4 58 | 59 | gpt-j-mnli-gq-4bit: 60 | <<: *gpt-j 61 | execution: 62 | micro_batch_size: 16 63 | available_memory_proportion: [ 0.4 ] 64 | tensor_parallel: 4 65 | group_quantise_weights: 64 66 | -------------------------------------------------------------------------------- /gptj-text-generation/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | -------------------------------------------------------------------------------- /gptj-text-generation/data/hf_data_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | # 3 | # Copyright 2020 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # This file has been modified by Graphcore Ltd. 18 | 19 | from itertools import chain 20 | from config import GPTJConfig 21 | 22 | 23 | def group_texts(config: GPTJConfig): 24 | seq_len_1 = config.model.sequence_length + 1 25 | 26 | def func(examples): 27 | # Concatenate all texts. 28 | inputs = list(chain(*examples["input_ids"])) 29 | total_length = len(inputs) 30 | # We drop the small remainder instead of padding 31 | if total_length >= seq_len_1: 32 | total_length = (total_length // seq_len_1) * seq_len_1 33 | # Split by chunks of max_len. 
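# Worked example (illustrative): with sequence_length = 4 (so seq_len_1 = 5) and
# concatenated tokens [0, 1, ..., 9], data becomes [[0,1,2,3,4], [5,6,7,8,9]],
# giving input_ids [[0,1,2,3], [5,6,7,8]] and labels [[1,2,3,4], [6,7,8,9]],
# i.e. the labels are the inputs shifted left by one token.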
34 | data = [inputs[i : i + seq_len_1] for i in range(0, total_length, seq_len_1)] 35 | result = { 36 | "input_ids": [d[:-1] for d in data], 37 | "labels": [d[1:] for d in data], 38 | } 39 | return result 40 | 41 | return func 42 | -------------------------------------------------------------------------------- /gptj-text-generation/imgs/bs_buffers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/gptj-text-generation/imgs/bs_buffers.png -------------------------------------------------------------------------------- /gptj-text-generation/imgs/data_parallelism.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/gptj-text-generation/imgs/data_parallelism.png -------------------------------------------------------------------------------- /gptj-text-generation/imgs/dp_tp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/gptj-text-generation/imgs/dp_tp.png -------------------------------------------------------------------------------- /gptj-text-generation/imgs/execution.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/gptj-text-generation/imgs/execution.jpg -------------------------------------------------------------------------------- /gptj-text-generation/imgs/gq-speed-accuracy-tradeoff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/gptj-text-generation/imgs/gq-speed-accuracy-tradeoff.png -------------------------------------------------------------------------------- /gptj-text-generation/imgs/mnli_dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/gptj-text-generation/imgs/mnli_dataset.png -------------------------------------------------------------------------------- /gptj-text-generation/imgs/rts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/gptj-text-generation/imgs/rts.png -------------------------------------------------------------------------------- /gptj-text-generation/imgs/tensor_parallelism.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/gptj-text-generation/imgs/tensor_parallelism.png -------------------------------------------------------------------------------- /gptj-text-generation/imgs/tp.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/gptj-text-generation/imgs/tp.jpg -------------------------------------------------------------------------------- /gptj-text-generation/imgs/tp_dp_rts.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/gptj-text-generation/imgs/tp_dp_rts.png -------------------------------------------------------------------------------- /gptj-text-generation/modelling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | -------------------------------------------------------------------------------- /gptj-text-generation/modelling/gptj_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | import numpy as np 3 | from typing import Dict 4 | from config import GPTJConfig 5 | import torch 6 | 7 | import popxl 8 | from popxl.utils import to_numpy 9 | 10 | import popxl_addons as addons 11 | from popxl_addons import NamedTensors 12 | from popxl_addons.named_tensors import NamedTensorData 13 | 14 | from popxl_addons.layers import LayerNorm 15 | 16 | from .embedding import GPTJEmbeddingsTP 17 | from .decoder import GPTJDecoderTP, GPTJDecoderBlockTP 18 | 19 | from transformers.models.gptj.modeling_gptj import GPTJModel as HFModel 20 | from transformers.models.gptj.configuration_gptj import GPTJConfig as GPTJConfigHF 21 | 22 | 23 | class GPTJModelTP(addons.Module): 24 | def __init__(self, config: GPTJConfig, include_layer_norm=True): 25 | super().__init__() 26 | self.config = config 27 | # sharded, then last bit identical 28 | self.embeddings = GPTJEmbeddingsTP(self.config) 29 | # identical inputs, then sharded, then identical 30 | self.decoder = GPTJDecoderTP(self.config) 31 | # identical 32 | self.include_layer_norm = include_layer_norm 33 | if self.include_layer_norm: 34 | self.ln_f = LayerNorm() 35 | 36 | def build(self, input_ids: popxl.Tensor): 37 | x = self.embeddings(input_ids) 38 | x = self.decoder(x) 39 | if self.include_layer_norm: 40 | x = self.ln_f(x) 41 | return x 42 | 43 | @staticmethod 44 | def hf_mapping( 45 | config: GPTJConfig, variables: NamedTensors, hf_model: HFModel, layer_norm=True 46 | ) -> Dict[popxl.Tensor, np.ndarray]: 47 | dtype = config.model.dtype 48 | weights = {} 49 | if layer_norm: 50 | weights = { 51 | variables.ln_f.weight: to_numpy(hf_model.ln_f.weight.data, dtype), 52 | variables.ln_f.bias: to_numpy(hf_model.ln_f.bias.data, dtype), 53 | } 54 | 55 | weights.update(GPTJEmbeddingsTP.hf_mapping(config, variables.embeddings, hf_model)) 56 | 57 | for l in range(config.model.layers): 58 | weights.update(GPTJDecoderBlockTP.hf_mapping(config, variables.decoder[l], hf_model.h[l])) 59 | 60 | return weights 61 | 62 | @staticmethod 63 | def to_hf(variables_data: NamedTensorData, hf_model: HFModel, layer_norm=True) -> Dict[str, torch.Tensor]: 64 | state_dict = {} 65 | if layer_norm: 66 | state_dict["ln_f.weight"] = torch.tensor(variables_data.ln_f.weight, dtype=hf_model.config.torch_dtype) 67 | state_dict["ln_f.bias"] = torch.tensor(variables_data.ln_f.bias, dtype=hf_model.config.torch_dtype) 68 | 69 | state_dict.update(GPTJEmbeddingsTP.to_hf(hf_model.config, variables_data.embeddings, hf_model.wte)) 70 | for l in range(hf_model.config.n_layer): 71 | state_dict.update( 72 | { 73 | "h." + str(l) + "." 
+ k: v 74 | for k, v in GPTJDecoderBlockTP.to_hf( 75 | hf_model.config, variables_data.decoder[l], hf_model.h[l] 76 | ).items() 77 | } 78 | ) 79 | return state_dict 80 | -------------------------------------------------------------------------------- /gptj-text-generation/modelling/hf_mapping.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | 3 | from typing import Dict 4 | import numpy as np 5 | 6 | from transformers.models.gptj import GPTJModel as HFModel 7 | from transformers.models.gptj import GPTJForCausalLM as HFLMHeadModel 8 | from transformers.models.gptj.configuration_gptj import GPTJConfig as GPTJConfigHF 9 | 10 | import popxl 11 | from popxl_addons import TaskSession 12 | 13 | from config import GPTJConfig 14 | from modelling.gptj_model import GPTJModelTP 15 | from modelling.gptj_lm import GPTJLMHeadLossAndGradTP, GPTJLMHeadModelTP 16 | 17 | 18 | def hf_mapping_lm_tp( 19 | config: GPTJConfig, session: TaskSession, pretrained: HFLMHeadModel 20 | ) -> Dict[popxl.Tensor, np.ndarray]: 21 | load_to = session.state 22 | if "fwd" in session.state: 23 | load_to = session.state.fwd 24 | weights = GPTJLMHeadModelTP.hf_mapping(config, load_to, pretrained) 25 | return weights 26 | 27 | 28 | def hf_mapping_TP(config: GPTJConfig, session: TaskSession, pretrained: HFModel) -> Dict[popxl.Tensor, np.ndarray]: 29 | load_to = session.state 30 | if "fwd" in session.state: 31 | load_to = session.state.fwd 32 | weights = GPTJModelTP.hf_mapping(config, load_to, pretrained) 33 | return weights 34 | 35 | 36 | def load_lm_to_hf(session: TaskSession, hf_model: HFLMHeadModel) -> HFLMHeadModel: 37 | weights = session.get_named_tensors_data() 38 | if "fwd" in weights: 39 | weights = weights.fwd 40 | state_dict = GPTJLMHeadModelTP.to_hf(weights, hf_model) 41 | # check only missing keys are mask-related keys 42 | hf_state_keys = hf_model.state_dict().keys() 43 | popxl_keys = state_dict.keys() 44 | 45 | def should_check(k: str): 46 | return "attn.bias" not in k and "attn.masked_bias" not in k 47 | 48 | for k in hf_state_keys: 49 | if should_check(k) and k not in popxl_keys: 50 | raise KeyError(f"key {k} not found in session state") 51 | 52 | hf_model.load_state_dict(state_dict, strict=False) 53 | return hf_model 54 | 55 | 56 | def load_to_hf(session: TaskSession, hf_model: HFModel) -> HFModel: 57 | weights = session.get_named_tensors_data() 58 | if "fwd" in weights: 59 | weights = weights.fwd 60 | 61 | state_dict = GPTJModelTP.to_hf(weights, hf_model) 62 | # check only missing keys are mask-related keys 63 | hf_state_keys = hf_model.state_dict().keys() 64 | popxl_keys = state_dict.keys() 65 | 66 | def should_check(k: str): 67 | return "attn.bias" not in k and "attn.masked_bias" not in k 68 | 69 | for k in hf_state_keys: 70 | if should_check(k) and k not in popxl_keys: 71 | raise KeyError(f"key {k} not found in session state") 72 | 73 | hf_model.load_state_dict(state_dict, strict=False) 74 | return hf_model 75 | -------------------------------------------------------------------------------- /gptj-text-generation/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = 3 | tests 4 | 5 | addopts = 6 | -r a 7 | -v 8 | 9 | python_paths = . 
../../../utils/ 10 | -------------------------------------------------------------------------------- /gptj-text-generation/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-pythonpath 3 | -------------------------------------------------------------------------------- /gptj-text-generation/requirements.txt: -------------------------------------------------------------------------------- 1 | --find-links https://download.pytorch.org/whl/cpu/torch_stable.html 2 | 3 | 4 | graphcore-cloud-tools[logger] @ git+https://github.com/graphcore/graphcore-cloud-tools@v0.3 5 | examples-utils[common] @ git+https://github.com/graphcore/examples-utils.git@7cd37a8eccabe88e3741eef2c31bafd4fcd30c4c 6 | pyyaml==5.4.1 7 | dataclasses==0.8; python_version < '3.7' 8 | transformers==4.25.0 9 | datasets 10 | evaluate==0.4.0 11 | tfrecord==1.14.1 12 | torch==2.0.1+cpu 13 | scipy>=1.5.4 14 | more-itertools==8.13.0 15 | wandb==0.12.8 16 | sklearn==0.0 17 | 18 | pytest==6.2.5 19 | pytest-pythonpath==0.7.4 20 | 21 | git+https://github.com/graphcore/popxl-addons.git@sdk-release-3.3.0 22 | 23 | protobuf==3.20.*; python_version > '3.6' 24 | -------------------------------------------------------------------------------- /gptj-text-generation/tests/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | import os 3 | from config import GPTJConfig 4 | 5 | import pytest 6 | 7 | from config import GPTJConfig 8 | from utils.simple_parsing_tools import parse_args_with_config_file 9 | 10 | 11 | def _test_config_file(): 12 | return os.path.join(os.path.dirname(__file__), "test_config.yml") 13 | 14 | 15 | @pytest.fixture 16 | def test_config_file(): 17 | return _test_config_file() 18 | 19 | 20 | @pytest.fixture 21 | def test_config(): 22 | return parse_args_with_config_file(GPTJConfig, ["--config", _test_config_file()]) 23 | 24 | 25 | # Below functions enable long tests to be skipped, unless a --long-test 26 | # cli option is specified. 27 | def pytest_addoption(parser): 28 | parser.addoption("--long-tests", action="store_true", default=False, help="Run long tests") 29 | -------------------------------------------------------------------------------- /gptj-text-generation/tests/integration/execution/test_execution.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
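# Smoke tests: run finetuning.py and inference.py as subprocesses with a tiny model
# configuration and assert that a "Duration" line is printed on completion.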
2 | from pathlib import Path 3 | from examples_tests.test_util import SubProcessChecker 4 | import os 5 | import sys 6 | 7 | root_dir = Path(__file__).parent.parent.parent.parent.resolve() 8 | 9 | 10 | def gptj_root_env_path(): 11 | env = os.environ 12 | env["PYTHONPATH"] = ":".join((*sys.path, str(root_dir))) 13 | return env 14 | 15 | 16 | class TestPretraining(SubProcessChecker): 17 | def test_finetuning(self): 18 | self.run_command( 19 | "python3 finetuning.py --config tiny --layers 3 " 20 | "--global_batch_size 16 --micro_batch_size 2 --data_parallel 2 --tensor_parallel 2 " 21 | "--vocab_size 128 --sequence_length 8 --rotary_dim 16 " 22 | "--hidden_size 64 --heads 4", 23 | root_dir, 24 | ["Duration"], 25 | env=gptj_root_env_path(), 26 | ) 27 | 28 | def test_inference(self): 29 | self.run_command( 30 | "python3 inference.py --config tiny --layers 3 " 31 | "--micro_batch_size 16 --data_parallel 1 --tensor_parallel 2 " 32 | "--vocab_size 128 --sequence_length 8 --rotary_dim 16 " 33 | "--hidden_size 64 --heads 4", 34 | root_dir, 35 | ["Duration"], 36 | env=gptj_root_env_path(), 37 | ) 38 | -------------------------------------------------------------------------------- /gptj-text-generation/tests/test_config.yml: -------------------------------------------------------------------------------- 1 | model: 2 | sequence_length: 8 3 | embedding: 4 | vocab_size: 128 5 | hidden_size: 64 6 | layers: 2 7 | attention: 8 | heads: 4 9 | rotary_dim: 8 10 | eval: True 11 | precision: "float32" 12 | training: 13 | global_batch_size: 2 14 | execution: 15 | micro_batch_size: 2 16 | data_parallel: 1 17 | attention_serialisation: 2 18 | -------------------------------------------------------------------------------- /gptj-text-generation/tests_serial/distributed_sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
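# Not collected by pytest directly: tests_serial/test_distributed_data.py launches this
# script as a subprocess under poprun (poprun --num-instances 2 --num-replicas 2
# python3 tests_serial/distributed_sampler.py) and asserts on the "Passed test"
# messages printed below.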
2 | import popxl 3 | from popxl import ops 4 | from data.data_utils import DistributedSampler, WorkerInit 5 | from torch.utils.data import DataLoader 6 | from typing import Tuple 7 | import numpy as np 8 | from pathlib import Path 9 | from mpi4py import MPI 10 | import time 11 | import popdist 12 | 13 | 14 | def sample_program(input_shape: Tuple, replicas: int): 15 | ir = popxl.Ir("popdist") 16 | 17 | with ir.main_graph: 18 | x_h2d = popxl.h2d_stream(input_shape, dtype=popxl.float32, name="x_in") 19 | x_d2h = popxl.d2h_stream(input_shape, dtype=popxl.float32, name="x_out") 20 | x = ops.host_load(x_h2d) 21 | ops.host_store(x_d2h, x) 22 | 23 | ir.num_host_transfers = 1 24 | return popxl.Session(ir, "ipu_hw"), x_h2d, x_d2h 25 | 26 | 27 | def distributed_sampler(): 28 | bs = 2 29 | inps = 5 30 | dataset_size = 2 * 10 31 | worker_seed = 47 32 | workers = 4 33 | epochs = 3 34 | replicas = 2 35 | 36 | dataset = np.random.random((dataset_size, inps)).astype(np.float32) 37 | sampler = DistributedSampler(dataset) 38 | dl = DataLoader( 39 | dataset, 40 | batch_size=bs, 41 | drop_last=True, 42 | num_workers=workers, 43 | worker_init_fn=WorkerInit(worker_seed), 44 | persistent_workers=workers > 0, 45 | sampler=sampler, 46 | ) 47 | session, in_stream, out_stream = sample_program((bs, inps), replicas) 48 | 49 | # check each instance get different data 50 | loader_list = list(dl)[0][0][0].numpy() 51 | 52 | # MPI to broadcast data in root=1 to root=0 53 | comm = MPI.COMM_WORLD 54 | rank = comm.Get_rank() 55 | loader_list_copy = np.copy(loader_list) 56 | comm.Bcast(loader_list, root=1) 57 | 58 | # Assert if data broadcast to root=0 is different 59 | if comm.Get_rank() == 0 and not np.all(loader_list_copy == loader_list): 60 | print("Passed test: instances have different data") 61 | 62 | # Wait until both roots are finished 63 | time.sleep(2) 64 | 65 | # check epochs behaviour 66 | epochs_first_data = [] 67 | for epoch in range(epochs): 68 | # set epoch explicitly before iterating dl 69 | sampler.set_epoch(epoch) 70 | step = 0 71 | for data in dl: 72 | x = data 73 | with session: 74 | out = session.run({in_stream: x})[out_stream] 75 | if step == 0: 76 | epochs_first_data.append(out) 77 | step += 1 78 | 79 | assert len(epochs_first_data) == epochs, f"Expected {epochs} elements to compare, found {len(epochs_first_data)}" 80 | # check each epoch data is sampled in different order 81 | for first_item in epochs_first_data[1:]: 82 | not np.all(first_item == epochs_first_data[0]) 83 | print("Passed test: each epoch samples dataset in different order") 84 | 85 | 86 | if __name__ == "__main__": 87 | distributed_sampler() 88 | -------------------------------------------------------------------------------- /gptj-text-generation/tests_serial/test_distributed_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
2 | from examples_tests.test_util import SubProcessChecker 3 | from pathlib import Path 4 | import os 5 | import subprocess 6 | import pytest 7 | import gc 8 | 9 | root_dir = Path(__file__).parent.parent.resolve() 10 | 11 | 12 | def run_poprun_cmdline(poprun_args, cmdline_args, script): 13 | cmd = ["poprun"] 14 | cmd.extend([str(item) for sublist in poprun_args.items() for item in sublist if item != ""]) 15 | cmd.append("python3") 16 | cmd.append(script) 17 | cmd.extend([str(item) for sublist in cmdline_args.items() for item in sublist if item != ""]) 18 | try: 19 | out = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=root_dir) 20 | except subprocess.CalledProcessError as e: 21 | print(f"TEST FAILED") 22 | print(f"stdout={e.stdout.decode('utf-8',errors='ignore')}") 23 | print(f"stderr={e.stderr.decode('utf-8',errors='ignore')}") 24 | raise 25 | return out, out.stdout.decode("utf-8"), out.stderr.decode("utf-8") 26 | 27 | 28 | def test_poprun_dataset(): 29 | """ 30 | Launch poprun as subprocess and assert output 31 | """ 32 | gc.collect() 33 | out, stdout, stderr = run_poprun_cmdline( 34 | { 35 | "--num-instances": 2, 36 | "--num-replicas": 2, 37 | }, 38 | {}, 39 | os.path.join(root_dir, "tests_serial/distributed_sampler.py"), 40 | ) 41 | assert "Passed test: instances have different data" in stdout, stderr 42 | assert "Passed test: each epoch samples dataset in different order" in stdout, stderr 43 | 44 | 45 | def test_poprun_dataloader_checkpoints(): 46 | """ 47 | Launch poprun as subprocess and assert output 48 | """ 49 | gc.collect() 50 | out, stdout, stderr = run_poprun_cmdline( 51 | { 52 | "--num-instances": 2, 53 | "--num-replicas": 2, 54 | }, 55 | {}, 56 | os.path.join(root_dir, "tests_serial/dataloader_checkpoints.py"), 57 | ) 58 | assert "Passed test: distributed dataloader checkpoint" in stdout, stderr 59 | -------------------------------------------------------------------------------- /gptj-text-generation/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | from .simple_parsing_tools import * 3 | -------------------------------------------------------------------------------- /gptj-text-generation/utils/simple_parsing_tools.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
2 | 3 | from examples_utils.parsing.simple_parsing_tools import * 4 | -------------------------------------------------------------------------------- /images/folder_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/images/folder_logo.png -------------------------------------------------------------------------------- /images/go_emotions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/images/go_emotions.png -------------------------------------------------------------------------------- /images/jupyter_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/images/jupyter_logo.png -------------------------------------------------------------------------------- /llama2-chatbot/.gitignore: -------------------------------------------------------------------------------- 1 | .graphcore/ 2 | .ipynb_checkpoints/ 3 | .exe_cache/ 4 | .__pycache__/ 5 | *.pyc 6 | -------------------------------------------------------------------------------- /llama2-chatbot/api/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | from .pipeline import LlamaPipeline 4 | -------------------------------------------------------------------------------- /llama2-chatbot/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
2 | 3 | from .config import LlamaConfig, Execution 4 | import os 5 | from pathlib import Path 6 | 7 | CONFIG_DIR = Path(os.path.dirname(__file__)) 8 | 9 | del os, Path 10 | -------------------------------------------------------------------------------- /llama2-chatbot/config/inference.yml: -------------------------------------------------------------------------------- 1 | # -------- Models -------- 2 | tiny: &tiny 3 | model: 4 | layers: 2 5 | hidden_size: 100 6 | sequence_length: 64 7 | attention: 8 | heads: 4 9 | rotary_dim: 4 10 | embedding: 11 | vocab_size: 150 12 | 13 | llama2_7b: &llama2_7b 14 | model: 15 | layers: 32 16 | hidden_size: 4096 17 | intermediate_size: 11008 18 | sequence_length: 2048 19 | attention: 20 | heads: 32 21 | embedding: 22 | vocab_size: 32000 23 | eps: 1.0e-6 24 | 25 | llama2_13b: &llama2_13b 26 | model: 27 | layers: 40 28 | hidden_size: 5120 29 | intermediate_size: 13824 30 | sequence_length: 2048 31 | attention: 32 | heads: 40 33 | embedding: 34 | vocab_size: 32000 35 | eps: 1.0e-5 36 | 37 | # ------------------------- 38 | 39 | # ------- Execution ------- 40 | release: 41 | tiny: 42 | <<: *tiny 43 | execution: 44 | micro_batch_size: 4 45 | available_memory_proportion: [ 0.4 ] 46 | tensor_parallel: 4 47 | 48 | llama2_7b_pod2: 49 | <<: *llama2_7b 50 | execution: 51 | micro_batch_size: 1 52 | available_memory_proportion: [ 0.1 ] 53 | tensor_parallel: 2 54 | 55 | llama2_7b_pod4: 56 | <<: *llama2_7b 57 | execution: 58 | micro_batch_size: 1 59 | available_memory_proportion: [ 0.4 ] 60 | tensor_parallel: 4 61 | 62 | llama2_7b_pod16: 63 | <<: *llama2_7b 64 | execution: 65 | micro_batch_size: 1 66 | available_memory_proportion: [ 0.4 ] 67 | tensor_parallel: 16 68 | 69 | llama2_13b_pod4: 70 | <<: *llama2_13b 71 | execution: 72 | micro_batch_size: 1 73 | available_memory_proportion: [ 0.4 ] 74 | tensor_parallel: 4 75 | 76 | # TODO: Add attention padding to support attention tensor-parallel up to 16 77 | llama2_13b_pod16: 78 | <<: *llama2_13b 79 | execution: 80 | micro_batch_size: 1 81 | available_memory_proportion: [ 0.4 ] 82 | tensor_parallel: 16 83 | attention_tensor_parallel: 4 84 | -------------------------------------------------------------------------------- /llama2-chatbot/modelling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | -------------------------------------------------------------------------------- /llama2-chatbot/modelling/decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
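# Tensor-parallel Llama decoder block: a pre-norm residual structure where
# x = x + attention(rms_norm_1(x)) followed by x = x + feed_forward(rms_norm_2(x)),
# with the attention and MLP layers sharded across replicas and the norms computed
# identically on every replica.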
2 | import numpy as np 3 | from typing import Dict 4 | 5 | import popxl 6 | from popxl import ops 7 | from popxl.utils import to_numpy 8 | 9 | import popxl_addons as addons 10 | from popxl_addons import NamedTensors 11 | 12 | from transformers.models.llama.modeling_llama import LlamaDecoderLayer as HFModel 13 | 14 | from config import LlamaConfig 15 | from .attention import LlamaSelfAttentionTP 16 | from .feed_forward import LlamaFeedForwardTP 17 | from .rms_norm import LlamaRMSNorm 18 | 19 | 20 | class LlamaDecoderBlockTP(addons.Module): 21 | def __init__(self, config: LlamaConfig): 22 | super().__init__() 23 | self.config = config 24 | # begins with identical computations: layer norm ln_1 25 | self.ln_1 = LlamaRMSNorm(self.config) 26 | # attention is sharded 27 | # identical computation for bias and skip connection 28 | self.attention = LlamaSelfAttentionTP(self.config) 29 | # begins with identical computations: layer norm ln_2 30 | self.ln_2 = LlamaRMSNorm(self.config) 31 | # feed forward is sharded 32 | # identical computation for bias, dropout and skip connection 33 | self.feed_forward = LlamaFeedForwardTP(self.config) 34 | 35 | def build(self, x: popxl.Tensor): 36 | initial_residual = x 37 | ax = self.ln_1(x) 38 | ax = self.attention(ax) 39 | ax = initial_residual + ax 40 | 41 | post_attn_residual = ax 42 | fx = self.ln_2(ax) 43 | fx = self.feed_forward(fx) 44 | 45 | hs = post_attn_residual + fx 46 | return hs 47 | 48 | 49 | @staticmethod 50 | def hf_mapping(config: LlamaConfig, variables: NamedTensors, hf_model: HFModel) -> Dict[popxl.Tensor, np.ndarray]: 51 | 52 | dtype = config.model.dtype 53 | weights = { 54 | variables.ln_1.weight: to_numpy(hf_model.input_layernorm.weight.data, dtype), 55 | variables.ln_2.weight: to_numpy(hf_model.post_attention_layernorm.weight.data, dtype), 56 | } 57 | weights.update(LlamaSelfAttentionTP.hf_mapping(config, variables.attention, hf_model.self_attn)) 58 | weights.update(LlamaFeedForwardTP.hf_mapping(config, variables.feed_forward, hf_model.mlp)) 59 | 60 | return weights 61 | 62 | 63 | class LlamaDecoderTP(addons.Module): 64 | def __init__(self, config: LlamaConfig): 65 | super().__init__() 66 | self.config = config 67 | 68 | def build(self, x: popxl.Tensor): 69 | 70 | facts, graph = LlamaDecoderBlockTP(self.config).create_graph(x) # Outline GPT Layer 71 | 72 | for i in range(self.config.model.layers): 73 | args_nt = self.add_variable_inputs(i, facts) 74 | (x,) = graph.bind(args_nt).call(x) 75 | 76 | return x 77 | -------------------------------------------------------------------------------- /llama2-chatbot/modelling/feed_forward.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
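# Tensor-parallel Llama MLP (SwiGLU): gate_proj and up_proj are sharded column-wise so
# the swish gating can be applied per shard, down_proj is sharded row-wise, and a final
# replicated all-reduce restores identical activations on every replica.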
2 | from typing import Optional, List, Dict 3 | import popxl 4 | from popxl import ops 5 | from popxl.utils import to_numpy 6 | from transformers.models.llama.modeling_llama import LlamaMLP as HFModel 7 | 8 | import popxl_addons as addons 9 | from config import LlamaConfig 10 | from popxl_addons.layers import Linear 11 | import numpy as np 12 | 13 | from popxl_addons.named_tensors import NamedTensors 14 | from popxl_addons.ops.replicated_all_reduce_TP import replicated_all_reduce 15 | from popxl_addons.array_munging import shard 16 | 17 | 18 | class LlamaFeedForwardTP(addons.Module): 19 | def __init__(self, config: LlamaConfig): 20 | super().__init__() 21 | self.config = config 22 | tp = config.execution.tensor_parallel 23 | dp = config.execution.data_parallel 24 | self.n_shards = tp 25 | self.replica_grouping = popxl.gcg().ir.replica_grouping(stride=tp, group_size=dp) 26 | self.intermediate_size = self.config.model.intermediate_size 27 | self.hidden_size = self.config.model.hidden_size 28 | 29 | # ----- Layers ----- 30 | # Sharded across devices - column wise 31 | self.gate_proj = Linear( 32 | self.intermediate_size // self.n_shards, bias=False, replica_grouping=self.replica_grouping) 33 | self.up_proj = Linear( 34 | self.intermediate_size // self.n_shards, bias=False, replica_grouping=self.replica_grouping) 35 | 36 | # Sharded across devices - row wise (no bias) 37 | self.down_proj = Linear( 38 | self.hidden_size, bias=False, replica_grouping=self.replica_grouping) 39 | 40 | def build(self, x: popxl.Tensor) -> List[popxl.Tensor]: 41 | """Identical input (x, seed) and identical output across shards.""" 42 | # ----- Sharded computation ----- 43 | 44 | # Shard column-wise since gelu is not linear. 45 | # Indeed, sharding row wise requires a sum AllReduce at the end, 46 | # but swish is not linear: swish(x+y) != swish(x) + swish(y) 47 | up = self.up_proj(x) 48 | 49 | gp = self.gate_proj(x) 50 | gp_act = ops.swish(gp) 51 | # Here, x is already sharded across devices. Since we don't have non linearities, 52 | # we can shard row-wise (which requires both X and the weight matrix to be sharded) 53 | # and then perform an all reduce 54 | z = gp_act * up 55 | 56 | z = self.down_proj(z) 57 | z = replicated_all_reduce(z, group=self.replica_grouping.transpose()) 58 | return z 59 | 60 | @staticmethod 61 | def hf_mapping(config: LlamaConfig, variables: NamedTensors, hf_model: HFModel) -> Dict[popxl.Tensor, np.ndarray]: 62 | dtype = config.model.dtype 63 | n_shards = config.execution.tensor_parallel 64 | 65 | return { 66 | variables.gate_proj.weight: shard( 67 | to_numpy(hf_model.gate_proj.weight.data.T, dtype), n_shards, axis=-1 68 | ), 69 | variables.up_proj.weight: shard( 70 | to_numpy(hf_model.up_proj.weight.data.T, dtype), n_shards, axis=-1 71 | ), 72 | variables.down_proj.weight: shard( 73 | to_numpy(hf_model.down_proj.weight.data.T, dtype), n_shards, axis=0 74 | ), 75 | } 76 | -------------------------------------------------------------------------------- /llama2-chatbot/modelling/hf_mapping.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
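# --- Illustrative sketch (not part of hf_mapping.py): why LlamaFeedForwardTP shards as it does ---
# gate_proj/up_proj above are split column-wise (the SiLU/swish non-linearity has to see
# whole columns), down_proj is split row-wise, and the per-shard partial results are summed
# by the final all-reduce. A numpy check of that identity (toy shapes, made up here):
#
#   import numpy as np
#   silu = lambda a: a / (1.0 + np.exp(-a))                      # a * sigmoid(a)
#   h, ff, tp = 8, 16, 4
#   rng = np.random.default_rng(0)
#   x, Wg, Wu, Wd = rng.random(h), rng.random((h, ff)), rng.random((h, ff)), rng.random((ff, h))
#   full = (silu(x @ Wg) * (x @ Wu)) @ Wd
#   slices = [slice(i * ff // tp, (i + 1) * ff // tp) for i in range(tp)]
#   parts = [(silu(x @ Wg[:, s]) * (x @ Wu[:, s])) @ Wd[s, :] for s in slices]
#   np.testing.assert_allclose(sum(parts), full)                 # the all-reduce sum recovers the result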
2 | 3 | from typing import Dict 4 | import numpy as np 5 | 6 | from transformers.models.llama import LlamaModel as HFModel 7 | from transformers.models.llama import LlamaForCausalLM as HFLMHeadModel 8 | 9 | import popxl 10 | from popxl_addons import TaskSession 11 | 12 | from config import LlamaConfig 13 | from modelling.llama_model import LlamaModelTP 14 | from modelling.llama_lm import LlamaLMHeadModelTP 15 | 16 | 17 | def hf_mapping_lm_tp( 18 | config: LlamaConfig, session: TaskSession, pretrained: HFLMHeadModel 19 | ) -> Dict[popxl.Tensor, np.ndarray]: 20 | load_to = session.state 21 | if "fwd" in session.state: 22 | load_to = session.state.fwd 23 | weights = LlamaLMHeadModelTP.hf_mapping(config, load_to, pretrained) 24 | return weights 25 | 26 | 27 | def hf_mapping_TP(config: LlamaConfig, session: TaskSession, pretrained: HFModel) -> Dict[popxl.Tensor, np.ndarray]: 28 | load_to = session.state 29 | if "fwd" in session.state: 30 | load_to = session.state.fwd 31 | weights = LlamaModelTP.hf_mapping(config, load_to, pretrained) 32 | return weights 33 | 34 | 35 | def load_lm_to_hf(session: TaskSession, hf_model: HFLMHeadModel) -> HFLMHeadModel: 36 | weights = session.get_named_tensors_data() 37 | if "fwd" in weights: 38 | weights = weights.fwd 39 | state_dict = LlamaLMHeadModelTP.to_hf(weights, hf_model) 40 | # check only missing keys are mask-related keys 41 | hf_state_keys = hf_model.state_dict().keys() 42 | popxl_keys = state_dict.keys() 43 | 44 | def should_check(k: str): 45 | return "attn.bias" not in k and "attn.masked_bias" not in k 46 | 47 | for k in hf_state_keys: 48 | if should_check(k) and k not in popxl_keys: 49 | raise KeyError(f"key {k} not found in session state") 50 | 51 | hf_model.load_state_dict(state_dict, strict=False) 52 | return hf_model 53 | 54 | 55 | def load_to_hf(session: TaskSession, hf_model: HFModel) -> HFModel: 56 | weights = session.get_named_tensors_data() 57 | if "fwd" in weights: 58 | weights = weights.fwd 59 | 60 | state_dict = LlamaModelTP.to_hf(weights, hf_model) 61 | # check only missing keys are mask-related keys 62 | hf_state_keys = hf_model.state_dict().keys() 63 | popxl_keys = state_dict.keys() 64 | 65 | def should_check(k: str): 66 | return "attn.bias" not in k and "attn.masked_bias" not in k 67 | 68 | for k in hf_state_keys: 69 | if should_check(k) and k not in popxl_keys: 70 | raise KeyError(f"key {k} not found in session state") 71 | 72 | hf_model.load_state_dict(state_dict, strict=False) 73 | return hf_model 74 | -------------------------------------------------------------------------------- /llama2-chatbot/modelling/llama_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
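# --- Illustrative sketch (not part of llama_model.py): using the hf_mapping helpers above ---
# hf_mapping_lm_tp turns a Hugging Face LlamaForCausalLM checkpoint into a
# {popxl.Tensor: np.ndarray} dict keyed by this model's variables, which is then written
# into a built TaskSession. Roughly (how config/session are built is elided here and
# follows the inference script):
#
#   from transformers import LlamaForCausalLM
#   hf_model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
#   weights = hf_mapping_lm_tp(config, session, hf_model)
#   session.write_variables_data(weights)
#
# load_lm_to_hf goes the other way: it reads the session's named tensors back into an HF
# state_dict and checks that only mask-related keys are missing.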
2 | import numpy as np 3 | from typing import Dict 4 | from config import LlamaConfig 5 | 6 | import popxl 7 | from popxl.utils import to_numpy 8 | 9 | import popxl_addons as addons 10 | from popxl_addons import NamedTensors 11 | 12 | # from popxl_addons.layers import LayerNorm 13 | from .rms_norm import LlamaRMSNorm 14 | 15 | from .embedding import LlamaEmbeddingsTP 16 | from .decoder import LlamaDecoderTP, LlamaDecoderBlockTP 17 | 18 | from transformers.models.llama.modeling_llama import LlamaModel as HFModel 19 | 20 | 21 | class LlamaModelTP(addons.Module): 22 | def __init__(self, config: LlamaConfig, include_layer_norm=True): 23 | super().__init__() 24 | self.config = config 25 | # sharded, then last bit identical 26 | self.embeddings = LlamaEmbeddingsTP(self.config) 27 | # identical inputs, then sharded, then identical 28 | self.decoder = LlamaDecoderTP(self.config) 29 | # identical 30 | self.include_layer_norm = include_layer_norm 31 | if self.include_layer_norm: 32 | self.ln_f = LlamaRMSNorm(self.config) 33 | 34 | def build(self, input_ids: popxl.Tensor): 35 | x = self.embeddings(input_ids) 36 | x = self.decoder(x) 37 | if self.include_layer_norm: 38 | x = self.ln_f(x) 39 | return x 40 | 41 | @staticmethod 42 | def hf_mapping( 43 | config: LlamaConfig, variables: NamedTensors, hf_model: HFModel, layer_norm=True 44 | ) -> Dict[popxl.Tensor, np.ndarray]: 45 | dtype = config.model.dtype 46 | 47 | weights = {} 48 | if layer_norm: 49 | weights = { 50 | variables.ln_f.weight: to_numpy(hf_model.norm.weight.data, dtype), 51 | } 52 | 53 | weights.update(LlamaEmbeddingsTP.hf_mapping(config, variables.embeddings, hf_model)) 54 | 55 | for l in range(config.model.layers): 56 | weights.update(LlamaDecoderBlockTP.hf_mapping(config, variables.decoder[l], hf_model.layers[l])) 57 | 58 | return weights 59 | -------------------------------------------------------------------------------- /llama2-chatbot/modelling/rms_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | from functools import partial 3 | from typing import Dict 4 | import torch 5 | import popxl 6 | from popxl import ops 7 | from popxl.utils import to_numpy 8 | from transformers.models.llama.modeling_llama import LlamaRMSNorm as HFModel 9 | 10 | import popxl_addons as addons 11 | from config import LlamaConfig 12 | from popxl_addons.named_tensors import NamedTensorData 13 | import numpy as np 14 | 15 | from popxl_addons.named_tensors import NamedTensors 16 | 17 | 18 | class LlamaRMSNorm(addons.Module): 19 | def __init__(self, config: LlamaConfig): 20 | super().__init__() 21 | self.eps = config.model.eps 22 | 23 | def build(self, x: popxl.Tensor) -> popxl.Tensor: 24 | """ 25 | Build RMS layer normalisation for Llama. No bias and no subtraction of mean. This is equivalent to T5LayerNorm. 
26 | """ 27 | w = self.add_variable_input("weight", partial(np.ones, x.shape[-1]), x.dtype) 28 | 29 | # Perform the computation in float32 30 | if x.dtype == popxl.float16: 31 | x = ops.cast(x, popxl.float32) 32 | 33 | variance = ops.mean(x * x, -1, keepdims=True) 34 | 35 | x = x / ops.sqrt(variance + self.eps) 36 | 37 | # Cast back down to float16 if needed 38 | if w.dtype == popxl.float16: 39 | x = ops.cast(x, popxl.float16) 40 | 41 | x = x * w 42 | return x 43 | 44 | @staticmethod 45 | def hf_mapping(config: LlamaConfig, variables: NamedTensors, hf_model: HFModel) -> Dict[popxl.Tensor, np.ndarray]: 46 | dtype = config.model.dtype 47 | weights = { 48 | variables.weight: to_numpy(hf_model.weight.data, dtype), 49 | } 50 | return weights -------------------------------------------------------------------------------- /llama2-chatbot/modelling/rotary_pos_embed/.rendered.rotary_pos_embed_binding.cpp: -------------------------------------------------------------------------------- 1 | // cppimport 2 | // NOTE: the cppimport comment is necessary for dynamic compilation when loading 3 | // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #include "common.hpp" 32 | #include "rotary_pos_embed.hpp" 33 | #include "rotary_pos_embedx.hpp" 34 | 35 | namespace py = pybind11; 36 | 37 | // -------------- PyBind -------------- 38 | // `rotary_pos_embed_binding` must equal filename 39 | PYBIND11_MODULE(rotary_pos_embed_binding, m) { 40 | // Bindings the parameters of the op: constructor + fields. 41 | py::class_> 43 | binding(m, "RotaryPosEmbedOp"); 44 | binding.def_static( 45 | "createOpInGraph", 46 | py::overload_cast( 48 | &popart::RotaryPosEmbedOp::createOpInGraph), 49 | py::arg("graph"), py::arg("inputs"), py::arg("outputs"), 50 | py::arg("rotaryDim"), py::arg("settings"), 51 | py::return_value_policy::reference); 52 | binding.def("outTensor", 53 | py::overload_cast(&popart::RotaryPosEmbedOp::outTensor), 54 | py::return_value_policy::reference); 55 | }; 56 | 57 | // -------------- cppimport -------------- 58 | // cppimport configuration for compiling the pybind11 module. 59 | // clang-format off 60 | /* 61 | 62 | */ 63 | -------------------------------------------------------------------------------- /llama2-chatbot/modelling/rotary_pos_embed/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | from .rotary_pos_embed import * 3 | -------------------------------------------------------------------------------- /llama2-chatbot/modelling/rotary_pos_embed/common.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
2 | #ifndef GUARD_ROTARYPOSEMBED_OPIDS 3 | #define GUARD_ROTARYPOSEMBED_OPIDS 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using InMapType = std::map; 11 | using OutMapType = std::map; 12 | using OutIndex = int; 13 | 14 | namespace popart { 15 | 16 | #define CUSTOM_OP_DOMAIN "popxl.addons.ops" 17 | 18 | const popart::OperatorIdentifier RotaryPosEmbed = OperatorIdentifier{ 19 | CUSTOM_OP_DOMAIN, 20 | "RotaryPosEmbed", 21 | 1, // Op version 22 | {3, 3}, // number of inputs 23 | 1 // number of outputs 24 | }; 25 | 26 | const popart::OperatorIdentifier RotaryPosEmbedGrad = OperatorIdentifier{ 27 | CUSTOM_OP_DOMAIN, 28 | "RotaryPosEmbedGrad", 29 | 1, // Op version 30 | {3, 3}, // number of inputs 31 | 1 // number of outputs 32 | }; 33 | 34 | } // namespace popart 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /llama2-chatbot/modelling/rotary_pos_embed/rotary_pos_embed.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include "common.hpp" 19 | #include "rotary_pos_embed.hpp" 20 | 21 | namespace popart { 22 | 23 | ///////////////////////////////////////////////////////////// 24 | ////// Fwd op 25 | 26 | RotaryPosEmbedOp::RotaryPosEmbedOp(const OperatorIdentifier &_opid, 27 | uint32_t rotary_dim_, 28 | const Op::Settings &settings_) 29 | : Op(_opid, settings_), rotary_dim{rotary_dim_} { 30 | if ((rotary_dim % 2) != 0) { 31 | throw error("RotaryPosEmbedOp::RotaryPosEmbedOp rotary_dim must be a " 32 | "multiple of 2"); 33 | } 34 | } 35 | 36 | std::unique_ptr RotaryPosEmbedOp::clone() const { 37 | return std::make_unique(*this); 38 | } 39 | 40 | std::vector> RotaryPosEmbedOp::getGradOps() { 41 | std::vector> result; 42 | result.push_back(std::make_unique(*this)); 43 | return result; 44 | } 45 | 46 | void RotaryPosEmbedOp::setup() { 47 | auto xInfo = inInfo(0); 48 | auto cosInfo = inInfo(1); 49 | auto sinInfo = inInfo(2); 50 | 51 | // check expected shapes 52 | if (xInfo.rank() != 4) { 53 | throw error( 54 | "RotaryPosEmbedOp::setup x should have rank 4 (batch, heads, seq, hh)"); 55 | } 56 | if (cosInfo.rank() != 3 || sinInfo.rank() != 3) { 57 | throw error("RotaryPosEmbedOp::setup trig functions should have rank 3 " 58 | "(1 or batch, seq, hh/2)"); 59 | } 60 | if ((rotary_dim % 2) != 0) { 61 | throw error("RotaryPosEmbedOp::setup rotary dim must be a multiple of 2"); 62 | } 63 | 64 | // x rotated 65 | outInfo(0) = xInfo; 66 | } 67 | 68 | void RotaryPosEmbedOp::appendOutlineAttributes(OpSerialiserBase &os) const { 69 | os.appendAttribute("rotary_dim", rotary_dim); 70 | Op::appendOutlineAttributes(os); 71 | } 72 | 73 | ///////////////////////////////////////////////////////////// 74 | ////// Grad op 75 | 76 | RotaryPosEmbedGradOp::RotaryPosEmbedGradOp(const RotaryPosEmbedOp &op) 77 | : Op(RotaryPosEmbedGrad, op.getSettings()), rotary_dim{op.rotary_dim} {} 78 | 79 | const std::map &RotaryPosEmbedGradOp::gradOutToNonGradIn() const { 80 | static const std::map outInfo = {{0, 0}}; 81 | return outInfo; 82 | } 83 | 84 | const std::vector & 85 | RotaryPosEmbedGradOp::gradInputInfo() const { 86 | static const std::vector inInfo = { 87 | {0, 0, GradOpInType::GradOut}, 88 | {1, 1, GradOpInType::In}, 89 | {2, 2, GradOpInType::In}}; 90 | 
return inInfo; 91 | } 92 | 93 | void RotaryPosEmbedGradOp::setup() { outInfo(0) = inInfo(0); } 94 | 95 | std::unique_ptr RotaryPosEmbedGradOp::clone() const { 96 | return std::make_unique(*this); 97 | } 98 | 99 | void RotaryPosEmbedGradOp::appendOutlineAttributes(OpSerialiserBase &os) const { 100 | os.appendAttribute("rotary_dim", rotary_dim); 101 | Op::appendOutlineAttributes(os); 102 | } 103 | 104 | } // namespace popart 105 | -------------------------------------------------------------------------------- /llama2-chatbot/modelling/rotary_pos_embed/rotary_pos_embed.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | #ifndef GUARD_NEURALNET_STRIDEDSLICE_HPP 3 | #define GUARD_NEURALNET_STRIDEDSLICE_HPP 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "common.hpp" 10 | 11 | namespace popart { 12 | 13 | class RotaryPosEmbedOp : public Op { 14 | public: 15 | RotaryPosEmbedOp(const OperatorIdentifier &_opid, uint32_t rotary_dim_, 16 | const Op::Settings &settings_); 17 | 18 | std::unique_ptr clone() const override; 19 | std::vector> getGradOps() override; 20 | void setup() final; 21 | 22 | float getSubgraphValue() const override { return getHighSubgraphValue(); } 23 | 24 | static RotaryPosEmbedOp * 25 | createOpInGraph(popart::Graph &graph, const InMapType &in, 26 | const OutMapType &out, uint32_t rotary_dim_, 27 | const popart::Op::Settings &settings) { 28 | return graph.createConnectedOp(in, out, RotaryPosEmbed, 29 | rotary_dim_, settings); 30 | } 31 | 32 | void appendOutlineAttributes(OpSerialiserBase &) const override; 33 | 34 | uint32_t rotary_dim = 0; 35 | }; 36 | 37 | class RotaryPosEmbedGradOp : public Op { 38 | public: 39 | RotaryPosEmbedGradOp(const RotaryPosEmbedOp &op); 40 | 41 | void setup() final; 42 | std::unique_ptr clone() const override; 43 | const std::vector &gradInputInfo() const final; 44 | const std::map &gradOutToNonGradIn() const final; 45 | 46 | float getSubgraphValue() const override { return getHighSubgraphValue(); } 47 | 48 | void appendOutlineAttributes(OpSerialiserBase &) const override; 49 | 50 | uint32_t rotary_dim = 0; 51 | }; 52 | 53 | } // namespace popart 54 | 55 | #endif 56 | -------------------------------------------------------------------------------- /llama2-chatbot/modelling/rotary_pos_embed/rotary_pos_embed_binding.cpp: -------------------------------------------------------------------------------- 1 | // cppimport 2 | // NOTE: the cppimport comment is necessary for dynamic compilation when loading 3 | // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #include "common.hpp" 32 | #include "rotary_pos_embed.hpp" 33 | #include "rotary_pos_embedx.hpp" 34 | 35 | namespace py = pybind11; 36 | 37 | // -------------- PyBind -------------- 38 | // `rotary_pos_embed_binding` must equal filename 39 | PYBIND11_MODULE(rotary_pos_embed_binding, m) { 40 | // Bindings the parameters of the op: constructor + fields. 
41 | py::class_> 43 | binding(m, "RotaryPosEmbedOp"); 44 | binding.def_static( 45 | "createOpInGraph", 46 | py::overload_cast( 48 | &popart::RotaryPosEmbedOp::createOpInGraph), 49 | py::arg("graph"), py::arg("inputs"), py::arg("outputs"), 50 | py::arg("rotaryDim"), py::arg("settings"), 51 | py::return_value_policy::reference); 52 | binding.def("outTensor", 53 | py::overload_cast(&popart::RotaryPosEmbedOp::outTensor), 54 | py::return_value_policy::reference); 55 | }; 56 | 57 | // -------------- cppimport -------------- 58 | // cppimport configuration for compiling the pybind11 module. 59 | // clang-format off 60 | /* 61 | <% 62 | cfg['sources'] = ['rotary_pos_embed.cpp', 'rotary_pos_embedx.cpp'] 63 | cfg['extra_compile_args'] = ['-std=c++14', '-fPIC', '-O2', '-DONNX_NAMESPACE=onnx', '-Wall', '-Wno-sign-compare'] 64 | cfg['libraries'] = ['popart', 'poputil', 'popops', 'poplin', 'popnn', 'poprand', 'gcl'] 65 | setup_pybind11(cfg) 66 | %> 67 | */ 68 | -------------------------------------------------------------------------------- /llama2-chatbot/modelling/rotary_pos_embed/rotary_pos_embed_binding.cpython-38-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/llama2-chatbot/modelling/rotary_pos_embed/rotary_pos_embed_binding.cpython-38-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /llama2-chatbot/modelling/rotary_pos_embed/rotary_pos_embedx.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | #ifndef GUARD_NEURALNET_ROTARYPOSEMBEDX_HPP 3 | #define GUARD_NEURALNET_ROTARYPOSEMBEDX_HPP 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | namespace popart { 10 | namespace popx { 11 | 12 | class RotaryPosEmbedOpx : public Opx { 13 | public: 14 | RotaryPosEmbedOpx(Op *, Devicex *); 15 | 16 | void grow(poplar::program::Sequence &) const; 17 | }; 18 | 19 | class RotaryPosEmbedGradOpx : public Opx { 20 | public: 21 | RotaryPosEmbedGradOpx(Op *, Devicex *); 22 | 23 | void grow(poplar::program::Sequence &) const; 24 | }; 25 | 26 | } // namespace popx 27 | } // namespace popart 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /llama2-chatbot/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = 3 | tests 4 | 5 | addopts = 6 | -r a 7 | -v 8 | 9 | python_paths = . 
../../../utils/ 10 | -------------------------------------------------------------------------------- /llama2-chatbot/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.24.2 2 | scipy==1.10.1 3 | 4 | transformers==4.31.0 5 | huggingface-hub==0.16.4 6 | accelerate==0.20.3 7 | sentencepiece==0.1.99 8 | 9 | pytest==6.2.5 10 | pytest-pythonpath==0.7.4 11 | 12 | git+https://github.com/graphcore/popxl-addons.git@sdk-release-3.3.0 13 | 14 | graphcore-cloud-tools[logger] @ git+https://github.com/graphcore/graphcore-cloud-tools@v0.3 15 | examples-utils[common] @ git+https://github.com/graphcore/examples-utils.git@v3.3 16 | 17 | -f https://download.pytorch.org/whl/torch_stable.html 18 | torch==2.0.1+cpu -------------------------------------------------------------------------------- /llama2-chatbot/run-inference.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 3 | import logging 4 | from typing import Optional, Tuple 5 | 6 | from transformers import AutoTokenizer 7 | from utils.setup import llama_config_setup 8 | from config import LlamaConfig 9 | from api import LlamaPipeline 10 | 11 | def run_inference_popxl(config: LlamaConfig, tokenizer, hf_model, sequence_length: Optional[int] = None): 12 | if sequence_length is not None: 13 | config.model.sequence_length = sequence_length 14 | 15 | pipe = LlamaPipeline(config, hf_llama_checkpoint=hf_model, tokenizer=tokenizer) 16 | 17 | def get_input() -> Tuple[str, float, int, int]: 18 | while True: 19 | try: 20 | logging.info("-- Enter prompt --") 21 | prompt = input("> ") 22 | logging.info("-- Enter Sampling Temperature (0 for greedy) --") 23 | temperature = float(input("> ")) 24 | logging.info("-- Enter top-k parameter (0 for max) --") 25 | k = int(input("> ")) 26 | logging.info("-- Enter number of tokens to generate --") 27 | num_tokens = int(input("> ")) 28 | break 29 | except ValueError: 30 | logging.info("Invalid input!") 31 | 32 | return prompt, temperature, k, num_tokens 33 | 34 | while True: 35 | prompt, temperature, k, output_length = get_input() 36 | pipe(prompt, k=k, temperature=temperature, output_length=output_length)[0] 37 | 38 | 39 | def main(): 40 | # --- Setup --- 41 | config, _, hf_model = llama_config_setup("config/inference.yml", "release", "llama2_7b_pod4", hf_model_setup=True) 42 | tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") 43 | 44 | run_inference_popxl(config, tokenizer, hf_model=hf_model, sequence_length=2048) 45 | 46 | 47 | if __name__ == "__main__": 48 | try: 49 | main() 50 | except Exception as e: 51 | logging.exception(e) # Log time of exception 52 | raise 53 | -------------------------------------------------------------------------------- /llama2-chatbot/tests/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
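# --- Illustrative sketch (not part of conftest.py): driving run-inference.py above ---
# The script builds a LlamaPipeline from the "llama2_7b_pod4" entry of config/inference.yml
# and loops over interactive prompts. Non-interactively, the same call looks roughly like
# (names as defined in run-inference.py):
#
#   pipe = LlamaPipeline(config, hf_llama_checkpoint=hf_model, tokenizer=tokenizer)
#   answer = pipe("What is a Mandelbrot set?", k=5, temperature=0.7, output_length=128)[0]
#
# temperature 0 is greedy decoding and k 0 is "max", per the prompts printed by get_input().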
2 | import os 3 | from config import LlamaConfig 4 | 5 | import pytest 6 | 7 | from utils.simple_parsing_tools import parse_args_with_config_file 8 | 9 | 10 | def _test_config_file(): 11 | return os.path.join(os.path.dirname(__file__), "test_config.yml") 12 | 13 | 14 | @pytest.fixture 15 | def test_config_file(): 16 | return _test_config_file() 17 | 18 | 19 | @pytest.fixture 20 | def test_config(): 21 | return parse_args_with_config_file(LlamaConfig, ["--config", _test_config_file()]) 22 | 23 | 24 | # Below functions enable long tests to be skipped, unless a --long-test 25 | # cli option is specified. 26 | def pytest_addoption(parser): 27 | parser.addoption("--long-tests", action="store_true", default=False, help="Run long tests") 28 | -------------------------------------------------------------------------------- /llama2-chatbot/tests/integration/execution/test_execution.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | import os 3 | import sys 4 | from pathlib import Path 5 | 6 | from examples_tests.test_util import SubProcessChecker 7 | 8 | root_dir = Path(__file__).parent.parent.parent.parent.resolve() 9 | 10 | 11 | def dolly_root_env_path(): 12 | env = os.environ 13 | env["PYTHONPATH"] = ":".join((*sys.path, str(root_dir))) 14 | return env 15 | 16 | 17 | class TestExecution(SubProcessChecker): 18 | def test_inference(self): 19 | self.run_command( 20 | "python3 inference.py --config tiny --layers 2 " 21 | "--tensor_parallel 4 " 22 | "--vocab_size 128 --sequence_length 16 " 23 | "--hidden_size 128 --heads 8", 24 | root_dir, 25 | ["Duration"], 26 | env=dolly_root_env_path(), 27 | ) 28 | -------------------------------------------------------------------------------- /llama2-chatbot/tests/integration/layers/test_attention_TP.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
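# --- Illustrative sketch (not part of test_attention_TP.py): how these tests get their LlamaConfig ---
# conftest.py builds the `test_config` fixture used below by parsing tests/test_config.yml
# through the shared simple-parsing helpers:
#
#   from utils.simple_parsing_tools import parse_args_with_config_file
#   cfg = parse_args_with_config_file(LlamaConfig, ["--config", "tests/test_config.yml"])
#   # cfg.model.hidden_size == 128, cfg.execution.tensor_parallel == 4 (see test_config.yml)
#
# so every layer test runs against a 2-layer, 4-head, tensor-parallel-4 toy model rather
# than a full Llama 2 checkpoint.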
2 | import numpy as np 3 | import torch 4 | 5 | # HF 6 | from transformers.models.llama import LlamaConfig as HFConfig 7 | from transformers.models.llama.modeling_llama import LlamaAttention 8 | 9 | import popxl 10 | 11 | import popxl_addons as addons 12 | from popxl_addons.patterns import apply_pre_alias_patterns 13 | from config import LlamaConfig 14 | from modelling.attention import LlamaSelfAttentionTP 15 | from popxl_addons.array_munging import repeat 16 | 17 | 18 | def test_attention_TP_cmp_huggingface(test_config: LlamaConfig): 19 | torch.manual_seed(42) 20 | 21 | batch_size = test_config.execution.micro_batch_size 22 | seq_len = test_config.model.sequence_length 23 | hidden_size = test_config.model.hidden_size 24 | intermediate_size = hidden_size * 4 25 | 26 | # HuggingFace 27 | config = HFConfig( 28 | hidden_size=hidden_size, 29 | max_position_embeddings=seq_len, 30 | intermediate_size=intermediate_size, 31 | num_attention_heads=test_config.model.attention.heads, 32 | rotary_dim=test_config.model.attention.rotary_dim, 33 | ) 34 | hf_model = LlamaAttention(config).eval() 35 | 36 | # HF forward 37 | input_t = torch.rand((batch_size, seq_len, hidden_size), requires_grad=True) 38 | output_, *_ = hf_model(input_t, None) 39 | output_HF = output_.detach().numpy() 40 | 41 | # TP 42 | n_shards = test_config.execution.tensor_parallel 43 | 44 | # popxl 45 | ir = popxl.Ir() 46 | ir.replication_factor = n_shards 47 | with ir.main_graph: 48 | inputs_data, inputs_host_steam, inputs_tensors = zip( 49 | *[ 50 | addons.host_load( 51 | input_t.reshape(-1, test_config.model.hidden_size), test_config.model.dtype, name="input" 52 | ), 53 | ] 54 | ) 55 | (x,) = inputs_tensors 56 | 57 | attn_args, attn_graph = LlamaSelfAttentionTP(test_config).create_graph(x) 58 | 59 | vars = attn_args.init() 60 | fwd_info = attn_graph.bind(vars).call_with_info(x) 61 | (acts,) = fwd_info.outputs 62 | 63 | fwd_d2h = addons.host_store(acts) 64 | 65 | # Run `OpToIdentityPattern` among others part of `PreAliasPatterns` 66 | apply_pre_alias_patterns(ir, level="default") 67 | 68 | weights = LlamaSelfAttentionTP.hf_mapping(test_config, vars, hf_model) 69 | 70 | inputs = {h2d: repeat(data, n_shards).squeeze() for h2d, data in zip(inputs_host_steam, inputs_data)} 71 | 72 | with popxl.Session(ir, "ipu_hw") as session: 73 | session.write_variables_data(weights) 74 | outputs_popxl = session.run(inputs) 75 | 76 | fwd_data = outputs_popxl[fwd_d2h] 77 | 78 | if n_shards > 1: 79 | assert len(fwd_data) == n_shards 80 | 81 | # Assert all IPU outputs are identical 82 | for i in range(1, n_shards): 83 | np.testing.assert_equal(fwd_data[0], fwd_data[i]) 84 | else: 85 | fwd_data = np.expand_dims(fwd_data, axis=0) 86 | 87 | # Assert nearly equal to HF 88 | np.testing.assert_almost_equal(output_HF, fwd_data[0].reshape(output_HF.shape), 4) 89 | -------------------------------------------------------------------------------- /llama2-chatbot/tests/integration/layers/test_decoder_block_TP.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
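# --- Illustrative sketch (not part of test_decoder_block_TP.py): the repeat/compare pattern used below ---
# Each tensor-parallel test feeds every replica an identical copy of the host input using
# popxl_addons.array_munging.repeat (assumed here to stack the array n_shards times along a
# new leading axis), then asserts all replicas produce identical outputs before comparing
# replica 0 against Hugging Face:
#
#   inputs = {h2d: repeat(data, n_shards) for h2d, data in zip(host_streams, host_data)}
#   ...
#   for i in range(1, n_shards):
#       np.testing.assert_equal(fwd_data[0], fwd_data[i])
#   np.testing.assert_almost_equal(output_HF, fwd_data[0].reshape(output_HF.shape), 3)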
2 | import numpy as np 3 | import torch 4 | 5 | # HF 6 | from transformers.models.llama.configuration_llama import LlamaConfig as HFConfig 7 | from transformers.models.llama.modeling_llama import LlamaDecoderLayer 8 | 9 | import popxl 10 | 11 | import popxl_addons as addons 12 | from popxl_addons.patterns import apply_pre_alias_patterns 13 | 14 | from config import LlamaConfig 15 | from modelling.decoder import LlamaDecoderBlockTP 16 | from popxl_addons.array_munging import repeat 17 | 18 | 19 | def test_decoder_block_TP_cmp_huggingface(test_config: LlamaConfig): 20 | torch.manual_seed(42) 21 | 22 | batch_size = test_config.execution.micro_batch_size 23 | seq_len = test_config.model.sequence_length 24 | hidden_size = test_config.model.hidden_size 25 | intermediate_size = hidden_size * 4 26 | 27 | # HuggingFace 28 | config = HFConfig( 29 | hidden_size=hidden_size, 30 | max_position_embeddings=seq_len, 31 | intermediate_size=intermediate_size, 32 | num_attention_heads=test_config.model.attention.heads, 33 | rotary_dim=test_config.model.attention.rotary_dim, 34 | use_parallel_residual=True, 35 | ) 36 | hf_model = LlamaDecoderLayer(config).eval() 37 | 38 | # HF forward 39 | input_t = torch.rand((batch_size, seq_len, hidden_size), requires_grad=True) 40 | (output_,) = hf_model(input_t) 41 | 42 | output_HF = output_.detach().numpy() 43 | 44 | # TP 45 | n_shards = test_config.execution.tensor_parallel 46 | test_config.execution.tensor_parallel = n_shards 47 | 48 | # popxl 49 | ir = popxl.Ir() 50 | ir.replication_factor = n_shards 51 | 52 | replica_grouping = ir.replica_grouping(stride=1, group_size=1) 53 | 54 | main = ir.main_graph 55 | 56 | with main: 57 | inputs_data, inputs_host_steam, inputs_tensors = zip( 58 | *[ 59 | addons.host_load(input_t.reshape(-1, test_config.model.hidden_size), popxl.float32, name="input"), 60 | ] 61 | ) 62 | (x,) = inputs_tensors 63 | 64 | args, graph = LlamaDecoderBlockTP(test_config).create_graph(x) 65 | 66 | ff_vars = args.init() 67 | ff = graph.bind(ff_vars) 68 | fwd_info = ff.call_with_info(x) 69 | (acts,) = fwd_info.outputs 70 | 71 | fwd_d2h = addons.host_store(acts) 72 | 73 | # Run `OpToIdentityPattern` among others part of `PreAliasPatterns` 74 | apply_pre_alias_patterns(ir, level="default") 75 | 76 | weights = LlamaDecoderBlockTP.hf_mapping(test_config, ff_vars, hf_model) 77 | 78 | inputs = {h2d: repeat(data, n_shards) for h2d, data in zip(inputs_host_steam, inputs_data)} 79 | 80 | with popxl.Session(ir, "ipu_hw") as session: 81 | session.write_variables_data(weights) 82 | outputs_popxl = session.run(inputs) 83 | 84 | fwd_data = outputs_popxl[fwd_d2h] 85 | 86 | assert len(fwd_data) == n_shards 87 | 88 | # Assert all IPU outputs are identical 89 | for i in range(1, n_shards): 90 | np.testing.assert_equal(fwd_data[0], fwd_data[i]) 91 | # Assert nearly equal to HF 92 | np.testing.assert_almost_equal(output_HF, fwd_data[0].reshape(output_HF.shape), 3) 93 | -------------------------------------------------------------------------------- /llama2-chatbot/tests/integration/layers/test_feed_forward_TP.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
2 | import numpy as np 3 | import torch 4 | 5 | # HF 6 | from transformers.models.llama.configuration_llama import LlamaConfig as HFConfig 7 | from transformers.models.llama.modeling_llama import LlamaMLP 8 | 9 | import popxl 10 | 11 | import popxl_addons as addons 12 | from popxl_addons.patterns import apply_pre_alias_patterns 13 | 14 | from config import LlamaConfig 15 | from modelling.feed_forward import LlamaFeedForwardTP 16 | from popxl_addons.array_munging import repeat 17 | 18 | 19 | def test_feed_forward_TP_cmp_huggingface(test_config: LlamaConfig): 20 | torch.manual_seed(42) 21 | 22 | batch_size = test_config.execution.micro_batch_size 23 | seq_len = test_config.model.sequence_length 24 | hidden_size = test_config.model.hidden_size 25 | intermediate_size = hidden_size * 4 26 | 27 | # HuggingFace 28 | config = HFConfig( 29 | hidden_size=hidden_size, 30 | max_position_embeddings=seq_len, 31 | intermediate_size=intermediate_size, 32 | num_attention_heads=test_config.model.attention.heads, 33 | ) 34 | hf_model = LlamaMLP(config).eval() 35 | 36 | # HF forward 37 | input_t = torch.rand((batch_size, seq_len, hidden_size)) 38 | outputs = hf_model(input_t) 39 | output_ = outputs.reshape(batch_size * seq_len, hidden_size) 40 | output_HF = output_.detach().numpy() 41 | 42 | # TP 43 | n_shards = test_config.execution.tensor_parallel 44 | 45 | # popxl 46 | ir = popxl.Ir() 47 | ir.replication_factor = n_shards 48 | 49 | main = ir.main_graph 50 | 51 | with main: 52 | inputs_data, inputs_host_steam, inputs_tensors = zip( 53 | *[ 54 | addons.host_load(input_t.reshape(-1, test_config.model.hidden_size), popxl.float32, name="input"), 55 | ] 56 | ) 57 | (x,) = inputs_tensors 58 | 59 | ff_args, ff_graph = LlamaFeedForwardTP(test_config).create_graph(x) 60 | 61 | ff_vars = ff_args.init() 62 | ff = ff_graph.bind(ff_vars) 63 | fwd_info = ff.call_with_info(x) 64 | (acts,) = fwd_info.outputs 65 | 66 | fwd_d2h = addons.host_store(acts) 67 | 68 | # Run `OpToIdentityPattern` among others part of `PreAliasPatterns` 69 | apply_pre_alias_patterns(ir, level="default") 70 | 71 | weights = LlamaFeedForwardTP.hf_mapping(test_config, ff_vars, hf_model) 72 | 73 | inputs = {h2d: repeat(data, n_shards) for h2d, data in zip(inputs_host_steam, inputs_data)} 74 | with popxl.Session(ir, "ipu_hw") as session: 75 | session.write_variables_data(weights) 76 | outputs_popxl = session.run(inputs) 77 | 78 | fwd_data = outputs_popxl[fwd_d2h] 79 | 80 | assert len(fwd_data) == n_shards 81 | 82 | # Assert all IPU outputs are identical 83 | for i in range(1, n_shards): 84 | np.testing.assert_equal(fwd_data[0], fwd_data[i]) 85 | # Assert nearly equal to HF 86 | np.testing.assert_almost_equal(output_HF, fwd_data[0].reshape(output_HF.shape), 3) 87 | -------------------------------------------------------------------------------- /llama2-chatbot/tests/integration/layers/test_lm_TP.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
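# --- Illustrative sketch (not part of test_lm_TP.py): reassembling vocab-sharded logits ---
# In the LM-head test below each replica emits its slice of the logits along the vocabulary
# axis, so the host-side check concatenates the shards and trims back to the real vocab size
# before comparing with Hugging Face:
#
#   fwd_data_full = np.concatenate(fwd_data, axis=-1)[:, :vocab_size]
#   np.testing.assert_almost_equal(output_HF, fwd_data_full.reshape(output_HF.shape), 3)
#
# The trim matters when the embedding/LM head is padded so the vocabulary divides evenly
# across tensor-parallel shards (an assumption about modelling/embedding.py, which is not
# shown here).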
2 | import numpy as np 3 | import torch 4 | 5 | # HF 6 | from transformers.models.llama import LlamaConfig as HFConfig 7 | from transformers.models.llama import LlamaForCausalLM 8 | 9 | import popxl 10 | from popxl.utils import to_numpy 11 | 12 | import popxl_addons as addons 13 | from popxl_addons.patterns import apply_pre_alias_patterns 14 | 15 | from config import LlamaConfig 16 | from modelling.embedding import LlamaEmbeddingsTP 17 | from modelling.llama_lm import LlamaLMHeadModelTP 18 | 19 | from popxl_addons.array_munging import shard 20 | 21 | 22 | def test_lm_TP_cmp_huggingface(test_config: LlamaConfig): 23 | torch.manual_seed(42) 24 | batch_size = test_config.execution.micro_batch_size 25 | hidden_size = test_config.model.hidden_size 26 | intermediate_size = hidden_size * 4 27 | seq_len = test_config.model.sequence_length 28 | # HuggingFace 29 | config = HFConfig( 30 | num_hidden_layers=test_config.model.layers, 31 | vocab_size=test_config.model.embedding.vocab_size, 32 | hidden_size=hidden_size, 33 | max_position_embeddings=seq_len, 34 | intermediate_size=intermediate_size, 35 | num_attention_heads=test_config.model.attention.heads, 36 | rotary_dim=test_config.model.attention.rotary_dim, 37 | ) 38 | hf_model = LlamaForCausalLM(config).eval() 39 | 40 | # HF forward 41 | input_t = torch.randint(0, test_config.model.embedding.vocab_size, (batch_size, test_config.model.sequence_length)) 42 | output_HF = hf_model(input_ids=input_t)[0] 43 | output_HF = output_HF.detach().numpy() 44 | 45 | # n_shards 46 | n_shards = test_config.execution.tensor_parallel 47 | 48 | # Offset inputs 49 | words_offsetted = LlamaEmbeddingsTP.offset_inputs(test_config, to_numpy(input_t)) 50 | # popxl 51 | ir = popxl.Ir() 52 | ir.replication_factor = n_shards 53 | replica_grouping = ir.replica_grouping(stride=1, group_size=1) 54 | main = ir.main_graph 55 | 56 | with main: 57 | inputs_data, inputs_host_steam, inputs_tensors = zip( 58 | *[ 59 | addons.host_load(words_offsetted[0], popxl.int32, name="words"), 60 | ] 61 | ) 62 | (words,) = inputs_tensors 63 | facts, graph = LlamaLMHeadModelTP(test_config).create_graph(words) 64 | vars = facts.init() 65 | llm = graph.bind(vars) 66 | call_info = llm.call_with_info(words) 67 | act, *_ = call_info.outputs 68 | act_stream = addons.host_store(act) 69 | 70 | apply_pre_alias_patterns(ir, level="default") 71 | 72 | # Map weights from huggingface 73 | weights = LlamaLMHeadModelTP.hf_mapping(test_config, vars, hf_model) 74 | 75 | inputs = dict(zip(inputs_host_steam, [words_offsetted])) 76 | 77 | ir.num_host_transfers = test_config.execution.device_iterations 78 | 79 | with popxl.Session(ir, "ipu_hw") as session: 80 | session.write_variables_data(weights) 81 | outs = session.run(inputs) 82 | 83 | # Fwd output 84 | fwd_data = outs[act_stream] 85 | assert len(fwd_data) == n_shards 86 | fwd_data_full = np.concatenate(fwd_data, axis=-1)[:, : test_config.model.embedding.vocab_size] 87 | np.testing.assert_almost_equal(output_HF, fwd_data_full.reshape(output_HF.shape), 3) 88 | -------------------------------------------------------------------------------- /llama2-chatbot/tests/integration/layers/test_model_TP.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
2 | import numpy as np 3 | import torch 4 | 5 | # HF 6 | 7 | from transformers.models.llama.configuration_llama import LlamaConfig as HFConfig 8 | from transformers.models.llama.modeling_llama import LlamaModel 9 | 10 | import popxl 11 | from popxl.utils import to_numpy 12 | 13 | import popxl_addons as addons 14 | from popxl_addons.patterns import apply_pre_alias_patterns 15 | 16 | from config import LlamaConfig 17 | from modelling.embedding import LlamaEmbeddingsTP 18 | from modelling.llama_model import LlamaModelTP 19 | 20 | 21 | def test_model_TP_cmp_huggingface(test_config: LlamaConfig): 22 | torch.manual_seed(42) 23 | 24 | batch_size = test_config.execution.micro_batch_size 25 | hidden_size = test_config.model.hidden_size 26 | seq_len = test_config.model.sequence_length 27 | intermediate_size = hidden_size * 4 28 | # HuggingFace 29 | config = HFConfig( 30 | num_hidden_layers=test_config.model.layers, 31 | vocab_size=test_config.model.embedding.vocab_size, 32 | hidden_size=hidden_size, 33 | max_position_embeddings=seq_len, 34 | intermediate_size=intermediate_size, 35 | num_attention_heads=test_config.model.attention.heads, 36 | rotary_dim=test_config.model.attention.rotary_dim, 37 | ) 38 | hf_model = LlamaModel(config).eval() 39 | 40 | # HF forward 41 | input_t = torch.randint(0, test_config.model.embedding.vocab_size, (batch_size, test_config.model.sequence_length)) 42 | output_HF = hf_model(input_ids=input_t)[0] 43 | output_HF = output_HF.detach().numpy() 44 | 45 | # TP 46 | tp = test_config.execution.tensor_parallel 47 | 48 | # Offset inputs 49 | words_offsetted = LlamaEmbeddingsTP.offset_inputs(test_config, to_numpy(input_t)) 50 | 51 | # popxl 52 | ir = popxl.Ir() 53 | ir.replication_factor = tp 54 | replica_grouping = ir.replica_grouping(stride=1, group_size=1) 55 | main = ir.main_graph 56 | 57 | with main: 58 | inputs_data, inputs_host_steam, inputs_tensors = zip( 59 | *[ 60 | addons.host_load(words_offsetted[0], popxl.int32, name="words"), 61 | ] 62 | ) 63 | (words,) = inputs_tensors 64 | facts, graph = LlamaModelTP(test_config).create_graph(words) 65 | 66 | vars = facts.init() 67 | llm = graph.bind(vars) 68 | call_info = llm.call_with_info(words) 69 | act, *_ = call_info.outputs 70 | act_stream = addons.host_store(act) 71 | 72 | apply_pre_alias_patterns(ir, level="default") 73 | 74 | # Map weights from huggingface 75 | weights = LlamaModelTP.hf_mapping(test_config, vars, hf_model) 76 | 77 | inputs = dict(zip(inputs_host_steam, [words_offsetted])) 78 | 79 | ir.num_host_transfers = test_config.execution.device_iterations 80 | 81 | with popxl.Session(ir, "ipu_hw") as session: 82 | session.write_variables_data(weights) 83 | outs = session.run(inputs) 84 | 85 | # Fwd output 86 | fwd_data = outs[act_stream] 87 | 88 | assert len(fwd_data) == tp 89 | for i in range(1, tp): 90 | np.testing.assert_equal(fwd_data[0], fwd_data[i]) 91 | 92 | np.testing.assert_almost_equal(output_HF, fwd_data[0].reshape(output_HF.shape), 3) 93 | -------------------------------------------------------------------------------- /llama2-chatbot/tests/test_config.yml: -------------------------------------------------------------------------------- 1 | model: 2 | sequence_length: 256 # 8 3 | embedding: 4 | vocab_size: 128 5 | hidden_size: 128 6 | layers: 2 7 | attention: 8 | heads: 4 9 | rotary_dim: 8 10 | precision: "float32" 11 | execution: 12 | micro_batch_size: 1 13 | data_parallel: 1 14 | tensor_parallel: 4 15 | -------------------------------------------------------------------------------- 
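# --- Illustrative sketch (not a file in this repo): shard sizes implied by tests/test_config.yml ---
# The layer tests build their Hugging Face configs with intermediate_size = 4 * hidden_size,
# and LlamaFeedForwardTP splits that dimension across tensor_parallel replicas, so with the
# values above each of the 4 replicas holds a 128-column slice of gate_proj/up_proj:

hidden_size, tensor_parallel = 128, 4
intermediate_size = 4 * hidden_size
assert intermediate_size % tensor_parallel == 0
print(intermediate_size // tensor_parallel)  # 128 columns per replica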
/llama2-chatbot/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | from .simple_parsing_tools import * 3 | -------------------------------------------------------------------------------- /llama2-chatbot/utils/simple_parsing_tools.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | from examples_utils.parsing.simple_parsing_tools import * 4 | -------------------------------------------------------------------------------- /molfeat/requirements.txt: -------------------------------------------------------------------------------- 1 | pydantic<2 2 | molfeat==0.8.8 3 | transformers 4 | rdkit==2023.3.1 5 | stmol 6 | seaborn==0.12.2 7 | ipywidgets==8.0.6 8 | matplotlib 9 | numpy 10 | tabulate==0.9.0 11 | py3Dmol==2.0.1.post1 12 | torchinfo==1.7.2 13 | -------------------------------------------------------------------------------- /molfeat/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from typing import Optional 4 | import seaborn as sns 5 | 6 | from stmol import showmol 7 | import py3Dmol 8 | 9 | from rdkit import Chem 10 | from rdkit.Chem import AllChem 11 | import tabulate 12 | 13 | 14 | class Emoji: 15 | microscope = '\U0001F52C' 16 | test_tube = '\U0001F9EA' 17 | yes = '\u2705' 18 | no = '\u274C' 19 | ruler = '\U0001F4CF' 20 | warning = '\u26A0' 21 | rocket = '\U0001F680' 22 | 23 | 24 | def make_block(smi: str) -> str: 25 | mol = Chem.MolFromSmiles(smi) 26 | mol = Chem.AddHs(mol) 27 | AllChem.EmbedMolecule(mol) 28 | mblock = Chem.MolToMolBlock(mol) 29 | return mblock 30 | 31 | 32 | def render_mol(xyz: str): 33 | xyzview = py3Dmol.view() 34 | xyzview.addModel(xyz, 'mol') 35 | xyzview.setStyle({'stick': {}}) 36 | xyzview.setBackgroundColor('white') 37 | xyzview.zoomTo() 38 | showmol(xyzview, height=500, width=500) 39 | return xyzview 40 | 41 | 42 | def plot_3d_mol(smile: str): 43 | blk = make_block(smile) 44 | view = render_mol(blk) 45 | return view 46 | 47 | 48 | def report_molecule_classification(name: str, y_truth: bool, out: Optional[float], smile: str): 49 | table = [ 50 | ["Molecule:", name], 51 | ["BBBP:", f"{y_truth} (target) {Emoji.microscope}"], 52 | ] 53 | if out is not None: 54 | table.append(["Prediction:", f"{bool(out > 0.5)} {Emoji.test_tube}"]) 55 | table.append(["Correct:", f"{Emoji.yes}" if bool(out > 0.5) == y_truth else f"{Emoji.no}"]) 56 | print(tabulate.tabulate(table, tablefmt="heavy_grid")) 57 | 58 | return plot_3d_mol(smile) 59 | 60 | 61 | def report_molecule_regression(name: str, y_truth: float, out: Optional[float], smile: str, mask=None): 62 | table = [ 63 | ["Molecule:", name], 64 | ["exp:", f"{y_truth:.4f} (target) {Emoji.microscope}"], 65 | ] 66 | if out is not None: 67 | err = abs(y_truth - out) 68 | table.append(["Prediction:", f"{out:.4f} {Emoji.test_tube}"]) 69 | table.append(["|err|:", f"{err:.4f} " + (f"{Emoji.ruler}" if err < 1.5 else f"{Emoji.warning}")]) 70 | print(tabulate.tabulate(table, tablefmt="heavy_grid")) 71 | return plot_3d_mol(smile) 72 | 73 | 74 | def plot_smoothed_loss(epoch_losses: np.ndarray, window_size: int = 10): 75 | moving_avg = np.convolve(epoch_losses, np.ones(window_size) / window_size, mode='valid') 76 | moving_avg = np.clip(moving_avg, 0, None) 77 | 78 | q1, q3 = np.percentile(epoch_losses, [25, 75]) 79 | iqr = 
q3 - q1 80 | 81 | fig, ax = plt.subplots(figsize=(10, 5)) 82 | ax.plot(moving_avg, color='#FF6F79') 83 | ax.fill_between( 84 | range(len(moving_avg)), np.clip(moving_avg - iqr, 0, None), moving_avg + iqr, alpha=0.3, color='#FF6F79' 85 | ) 86 | 87 | ax.set_title('Smoothed Loss with IQR') 88 | ax.set_xlabel('Steps') 89 | ax.set_ylabel('Loss') 90 | 91 | plt.show() 92 | 93 | 94 | def plot_contours(test_y_true: np.ndarray, test_y_hat: np.ndarray, r2: float, mae: float): 95 | plt.style.use('seaborn') 96 | 97 | hist, xedges, yedges = np.histogram2d(test_y_true, test_y_hat, bins=10) 98 | X, Y = np.meshgrid(xedges[:-1], yedges[:-1]) 99 | Z = hist.T 100 | plt.contour(X, Y, Z, colors=None, levels=5, linewidths=1.5, alpha=0.7, cmap='viridis') 101 | 102 | plt.scatter(test_y_true, test_y_hat, alpha=0.7, edgecolors='k', linewidths=0.5) 103 | 104 | plt.gca().annotate( 105 | "$R2 = {:.2f}$\n MAE = {:.2f}".format(r2, mae), 106 | xy=(0.05, 0.9), 107 | xycoords='axes fraction', 108 | size=10, 109 | bbox=dict(boxstyle="round", fc=(1.0, 0.7, 0.7), ec="none"), 110 | ) 111 | 112 | plt.xlabel("y true") 113 | plt.ylabel("y pred") 114 | 115 | plt.show() 116 | -------------------------------------------------------------------------------- /multimodal/magma/configs/MAGMA_v1.yml: -------------------------------------------------------------------------------- 1 | { 2 | # image encoder settings 3 | encoder_name: 'clip_resnet_large', 4 | adapter_config: {"mlp": {"adapter_type": "normal", "downsample_factor": 4}}, 5 | freeze_img_encoder: false, 6 | 7 | # train settings 8 | batch_size: 256, 9 | train_steps: 150000, 10 | lr: 8.0e-4, 11 | min_lr: 0.0, 12 | lr_decay_iters: 300000, 13 | image_enc_lr: 2.0e-6, 14 | use_image_embed_layernorm: true, 15 | image_embed_dropout_prob: 0.1, 16 | image_size: 384, 17 | 18 | gradient_accumulation_steps: 8, 19 | zero_stage: 2, 20 | gradient_clipping: 1.0, 21 | 22 | # dataset / save / load settings 23 | train_dataset_name: 'conceptual_captions', 24 | train_dataset_dir: '/mnt/localdisk/conceptual_captions', 25 | eval_dataset_name: 'coco', 26 | eval_dataset_dir: '/mnt/localdisk/coco_data', 27 | 28 | save: "/mnt/shared_vol/checkpoints/multimodal_transformer_rn50x16", 29 | load: "/mnt/shared_vol/checkpoints/multimodal_transformer_rn50x16", 30 | 31 | eval_every: 100, 32 | 33 | } 34 | -------------------------------------------------------------------------------- /multimodal/magma/configs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
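# --- Illustrative sketch (not part of this file): using the plotting helpers from molfeat/utils.py above ---
# plot_smoothed_loss expects a 1-D array of per-step losses and overlays a moving average
# with an IQR band; plot_contours compares predictions against targets (r2/mae are only used
# for the annotation box). A toy call with synthetic data:
#
#   import numpy as np
#   losses = np.abs(np.random.randn(200)) * np.linspace(1.0, 0.1, 200)   # fake decaying loss
#   plot_smoothed_loss(losses, window_size=10)
#
#   y_true = np.random.randn(100)
#   y_hat = y_true + 0.1 * np.random.randn(100)
#   plot_contours(y_true, y_hat, r2=0.95, mae=0.08)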
2 | 3 | from .config import * 4 | import os 5 | from pathlib import Path 6 | 7 | CONFIG_DIR = Path(os.path.dirname(__file__)) 8 | 9 | del os, Path 10 | -------------------------------------------------------------------------------- /multimodal/magma/configs/inference.yml: -------------------------------------------------------------------------------- 1 | # -------- LM -------- 2 | gptj_6B: &gptj_6B 3 | layers: 28 4 | hidden_size: 4096 5 | sequence_length: 1024 6 | precision: 'float16' 7 | attention: 8 | heads: 16 9 | rotary_positional_embeddings_base: 10000 10 | rotary_dim: 64 11 | embedding: 12 | vocab_size: 50400 13 | real_vocab_size: 50258 14 | 15 | gptj_tiny: &gptj_tiny 16 | sequence_length: 8 17 | embedding: 18 | vocab_size: 128 19 | hidden_size: 64 20 | layers: 2 21 | attention: 22 | heads: 8 23 | rotary_dim: 8 24 | precision: "float16" 25 | 26 | # -------- VISUAL ENCODER -------- 27 | clip_resnet_large: &clip_resnet_large 28 | width: 96 29 | image_resolution: 384 30 | precision: 'float16' 31 | 32 | clip_tiny: &clip_tiny 33 | width: 36 # default 96 -> 1/8 default 34 | image_resolution: 48 # default 384 -> 1/8 default 35 | precision: 'float16' 36 | 37 | # -------- MAGMA -------- 38 | tiny: 39 | magma_v1: 40 | visual: 41 | <<: *clip_tiny 42 | execution: 43 | micro_batch_size: 1 44 | available_memory_proportion: [ 1.0 ] 45 | transformer: 46 | <<: *gptj_tiny 47 | ff_adapter: 48 | mode: 'normal' 49 | downsample_factor: 4 50 | execution: 51 | micro_batch_size: 1 52 | attention_serialisation: 1 53 | tensor_parallel: 2 54 | 55 | release: 56 | "magma_v1_1024": 57 | seed: 0 58 | visual: 59 | <<: *clip_resnet_large 60 | execution: 61 | micro_batch_size: 1 62 | available_memory_proportion: [ 1.0 ] 63 | transformer: 64 | <<: *gptj_6B 65 | ff_adapter: 66 | mode: 'normal' 67 | downsample_factor: 4 68 | execution: 69 | available_memory_proportion: [ 0.45 ] 70 | tensor_parallel: 4 71 | micro_batch_size: 1 72 | attention_serialisation: 1 73 | 74 | "magma_v1_500": 75 | seed: 0 76 | visual: 77 | <<: *clip_resnet_large 78 | transformer: 79 | <<: *gptj_6B 80 | sequence_length: 500 81 | execution: 82 | available_memory_proportion: [ 0.45 ] 83 | tensor_parallel: 4 84 | micro_batch_size: 1 85 | attention_serialisation: 1 86 | ff_adapter: 87 | mode: 'normal' 88 | downsample_factor: 4 89 | -------------------------------------------------------------------------------- /multimodal/magma/demo_example_images/cantaloupe_popsicle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/demo_example_images/cantaloupe_popsicle.jpg -------------------------------------------------------------------------------- /multimodal/magma/demo_example_images/circles.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/demo_example_images/circles.jpg -------------------------------------------------------------------------------- /multimodal/magma/demo_example_images/circles_square.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/demo_example_images/circles_square.jpg -------------------------------------------------------------------------------- 
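# --- Illustrative sketch (not a file in this repo): resolving multimodal/magma/configs/inference.yml ---
# As with the Llama config, the MAGMA YAML above composes each release entry from the
# *clip_resnet_large and *gptj_6B anchors. Loading it directly (PyYAML assumed, path
# relative to the repository root):

import yaml

with open("multimodal/magma/configs/inference.yml") as f:
    cfg = yaml.safe_load(f)

magma = cfg["release"]["magma_v1_1024"]
print(magma["visual"]["image_resolution"])                    # 384
print(magma["transformer"]["attention"]["heads"])             # 16
print(magma["transformer"]["execution"]["tensor_parallel"])   # 4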
/multimodal/magma/demo_example_images/korea.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/demo_example_images/korea.jpg -------------------------------------------------------------------------------- /multimodal/magma/demo_example_images/matterhorn.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/demo_example_images/matterhorn.jpg -------------------------------------------------------------------------------- /multimodal/magma/demo_example_images/mushroom.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/demo_example_images/mushroom.jpg -------------------------------------------------------------------------------- /multimodal/magma/demo_example_images/people.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/demo_example_images/people.jpg -------------------------------------------------------------------------------- /multimodal/magma/demo_example_images/playarea.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/demo_example_images/playarea.jpg -------------------------------------------------------------------------------- /multimodal/magma/demo_example_images/popsicle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/demo_example_images/popsicle.png -------------------------------------------------------------------------------- /multimodal/magma/demo_example_images/rainbow_popsicle.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/demo_example_images/rainbow_popsicle.jpeg -------------------------------------------------------------------------------- /multimodal/magma/demo_example_images/table_tennis.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/demo_example_images/table_tennis.jpg -------------------------------------------------------------------------------- /multimodal/magma/images/MagmaStructure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/images/MagmaStructure.png -------------------------------------------------------------------------------- /multimodal/magma/images/demo_magma.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/images/demo_magma.png -------------------------------------------------------------------------------- /multimodal/magma/modelling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | from .clip_resnet import * 4 | from .gptj import * 5 | from .adapters_TP import * 6 | from .image_prefix import * 7 | from .magma_mapping import * 8 | -------------------------------------------------------------------------------- /multimodal/magma/modelling/clip_resnet/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | from .batch_norm import * 4 | from .bottleneck import * 5 | from .stem import * 6 | from .modified_resnet import * 7 | from .attention_pool import * 8 | -------------------------------------------------------------------------------- /multimodal/magma/modelling/clip_resnet/batch_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | import numpy as np 4 | from functools import partial 5 | 6 | import popxl 7 | import popxl.ops as ops 8 | import popxl_addons as addons 9 | 10 | __all__ = ["BatchNorm2D"] 11 | 12 | 13 | class BatchNorm2D(addons.Module): 14 | def __init__(self, epsilon: float = 1e-5, momentum: float = 0.9): 15 | """ 16 | Implements Batch Normalization (only for inference) 17 | """ 18 | super().__init__() 19 | self.epsilon = epsilon 20 | self.momentum = momentum # Not used in inference; the default used is consistent with ONNX 21 | 22 | def build(self, x: popxl.Tensor) -> popxl.Tensor: 23 | 24 | shape = (x.shape[1],) 25 | 26 | self.weight = self.add_variable_input( 27 | "weight", 28 | partial(np.ones, shape), 29 | x.dtype, 30 | ) 31 | 32 | self.bias = self.add_variable_input( 33 | "bias", 34 | partial(np.zeros, shape), 35 | x.dtype, 36 | ) 37 | 38 | self.running_mean = self.add_variable_input( 39 | "running_mean", 40 | partial(np.zeros, shape), 41 | x.dtype, 42 | ) 43 | 44 | self.running_var = self.add_variable_input( 45 | "running_var", 46 | partial(np.ones, shape), 47 | x.dtype, 48 | ) 49 | 50 | y = ops.batch_norm_inference( 51 | x, 52 | scale=self.weight, 53 | bias=self.bias, 54 | mean=self.running_mean, 55 | var=self.running_var, 56 | epsilon=self.epsilon, 57 | momentum=self.momentum, 58 | ) 59 | 60 | return y 61 | -------------------------------------------------------------------------------- /multimodal/magma/modelling/clip_resnet/stem.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | import numpy as np 4 | 5 | from popxl_addons import Module, NamedTensors 6 | from popxl_addons.layers import Conv2D 7 | import popxl 8 | from popxl import Tensor 9 | from popxl import ops 10 | from popxl.utils import to_numpy 11 | from clip.model import ModifiedResNet as ClipModifiedResNet 12 | from configs import ResNetConfig 13 | from .batch_norm import BatchNorm2D 14 | 15 | __all__ = ["Stem"] 16 | 17 | 18 | class Stem(Module): 19 | def __init__(self, config: ResNetConfig): 20 | """ 21 | Stem block of CLIP ModifiedResNet. 
22 | Inference only: batch normalisation layers can work with a baked running mean and running vars, 23 | but these values won't be updated. 24 | """ 25 | super().__init__() 26 | self.config = config 27 | self.conv1 = Conv2D(self.config.width // 2, kernel_size=3, strides=(2, 2), paddings=(1, 1, 1, 1), bias=False) 28 | self.bn1 = BatchNorm2D() 29 | self.conv2 = Conv2D(self.config.width // 2, kernel_size=3, paddings=(1, 1, 1, 1), bias=False) 30 | self.bn2 = BatchNorm2D() 31 | self.conv3 = Conv2D(self.config.width, kernel_size=3, paddings=(1, 1, 1, 1), bias=False) 32 | self.bn3 = BatchNorm2D() 33 | 34 | def build(self, x: popxl.Tensor) -> popxl.Tensor: 35 | x = self.conv1(x) 36 | x = self.bn1(x) 37 | x = self.conv2(ops.relu(x)) 38 | x = self.bn2(x) 39 | x = self.conv3(ops.relu(x)) 40 | x = self.bn3(x) 41 | # NOTE: average pool in pytorch has stride default value = kernel size. 42 | # this is different in popxl so we need to set all parameters 43 | x = ops.average_pool(ops.relu(x), kernel_size=(2, 2), stride=(2, 2)) 44 | return x 45 | 46 | @staticmethod 47 | def clip_mapping(clip_model: ClipModifiedResNet, variables: NamedTensors): 48 | state_dict = { 49 | variables.conv1.weight: to_numpy(clip_model.conv1.weight.data), 50 | variables.bn1.weight: to_numpy(clip_model.bn1.weight.data), 51 | variables.bn1.bias: to_numpy(clip_model.bn1.bias.data), 52 | variables.bn1.running_mean: to_numpy(clip_model.bn1.running_mean.data), 53 | variables.bn1.running_var: to_numpy(clip_model.bn1.running_var.data), 54 | variables.conv2.weight: to_numpy(clip_model.conv2.weight.data), 55 | variables.bn2.weight: to_numpy(clip_model.bn2.weight.data), 56 | variables.bn2.bias: to_numpy(clip_model.bn2.bias.data), 57 | variables.bn2.running_mean: to_numpy(clip_model.bn2.running_mean.data), 58 | variables.bn2.running_var: to_numpy(clip_model.bn2.running_var.data), 59 | variables.conv3.weight: to_numpy(clip_model.conv3.weight.data), 60 | variables.bn3.weight: to_numpy(clip_model.bn3.weight.data), 61 | variables.bn3.bias: to_numpy(clip_model.bn3.bias.data), 62 | variables.bn3.running_mean: to_numpy(clip_model.bn3.running_mean.data), 63 | variables.bn3.running_var: to_numpy(clip_model.bn3.running_var.data), 64 | } 65 | return state_dict 66 | -------------------------------------------------------------------------------- /multimodal/magma/modelling/gptj/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | from .attention import * 4 | from .decoder import * 5 | from .embedding import * 6 | from .feed_forward import * 7 | from .gptj_lm import * 8 | from .gptj_model import * 9 | from .finetuneanon_mapping import * 10 | -------------------------------------------------------------------------------- /multimodal/magma/modelling/gptj/feed_forward.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
2 | from typing import Optional, List, Dict 3 | import torch 4 | import popxl 5 | from popxl import ops 6 | from popxl.utils import to_numpy 7 | from transformers.models.gpt_neo.modeling_gpt_neo import GPTNeoMLP as HFModel 8 | from transformers.models.gpt_neo.configuration_gpt_neo import GPTNeoConfig as GPTJConfigHF 9 | 10 | import popxl_addons as addons 11 | from configs import GPTJConfig 12 | from popxl_addons.named_tensors import NamedTensorData 13 | from popxl_addons.layers import Linear, LayerNorm 14 | import numpy as np 15 | 16 | from popxl_addons.named_tensors import NamedTensors 17 | from popxl_addons.ops.replicated_all_reduce_TP import ( 18 | replicated_all_reduce_identical_inputs, 19 | replicated_all_reduce_identical_grad_inputs, 20 | ) 21 | from popxl_addons.array_munging import shard 22 | 23 | 24 | class GPTJFeedForwardTP(addons.Module): 25 | def __init__(self, config: GPTJConfig, ff_size: Optional[int] = None): 26 | super().__init__() 27 | self.config = config 28 | tp = config.execution.tensor_parallel 29 | self.n_shards = tp 30 | self.replica_grouping = popxl.gcg().ir.replica_grouping(stride=tp, group_size=1) 31 | # Also known as the intermediate size 32 | self.ff_size = 4 * config.hidden_size if ff_size is None else ff_size 33 | assert self.ff_size % self.n_shards == 0 34 | # ----- Layers ----- 35 | # Sharded across devices - column wise 36 | self.intermediate = Linear(self.ff_size // self.n_shards, replica_grouping=self.replica_grouping) 37 | 38 | # Sharded across devices - row wise (bias applied separately) 39 | self.output = Linear(config.hidden_size, bias=False, replica_grouping=self.replica_grouping) 40 | 41 | def build(self, x: popxl.Tensor) -> List[popxl.Tensor]: 42 | """Identical input (x,) and identical output across shards.""" 43 | # ----- Identical computation ----- 44 | z = replicated_all_reduce_identical_inputs(x, group=self.replica_grouping.transpose()) 45 | 46 | # ----- Sharded computation ----- 47 | 48 | # Shard column-wise since gelu is not linear. 49 | # Indeed, sharding row wise requires a sum AllReduce at the end, 50 | # but gelu is not linear: gelu(x+y) != gelu(x) + gelu(y) 51 | z = self.intermediate(z) 52 | z = ops.gelu(z) 53 | # Here, x is already sharded across devices. 
Since we don't have non-linearities, 54 | # we can shard row-wise (which requires both X and the weight matrix to be sharded) 55 | # and then perform an all reduce 56 | z = self.output(z) 57 | 58 | z = replicated_all_reduce_identical_grad_inputs(z, group=self.replica_grouping.transpose()) 59 | 60 | # ----- Identical computation ----- 61 | 62 | # Output linear layer bias (identical bias on all devices) 63 | self.bias = self.add_variable_input("bias", lambda: np.zeros(z.shape[-1]), z.dtype) 64 | z = z + self.bias 65 | 66 | return z 67 | 68 | @staticmethod 69 | def finetuneanon_mapping( 70 | config: GPTJConfig, variables: NamedTensors, hf_model: HFModel 71 | ) -> Dict[popxl.Tensor, np.ndarray]: 72 | dtype = config.dtype 73 | n_shards = config.execution.tensor_parallel 74 | 75 | return { 76 | variables.intermediate.weight: shard(to_numpy(hf_model.c_fc.weight.data.T, dtype), n_shards, axis=-1), 77 | variables.intermediate.bias: shard(to_numpy(hf_model.c_fc.bias.data, dtype), n_shards, axis=-1), 78 | variables.output.weight: shard(to_numpy(hf_model.c_proj.weight.data.T, dtype), n_shards, axis=0), 79 | variables.bias: to_numpy(hf_model.c_proj.bias.data, dtype), 80 | } 81 | -------------------------------------------------------------------------------- /multimodal/magma/modelling/gptj/finetuneanon_mapping.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | from typing import Dict 4 | import numpy as np 5 | 6 | from transformers.models.gpt_neo import GPTNeoForCausalLM as HFLMHeadModel 7 | from transformers.models.gpt_neo import GPTNeoModel as HFModel 8 | 9 | from transformers.models.gpt_neo.configuration_gpt_neo import GPTNeoConfig as GPTJConfigHF 10 | 11 | import popxl 12 | from popxl_addons import TaskSession 13 | 14 | from configs import GPTJConfig 15 | from modelling.gptj.gptj_model import GPTJModelTP 16 | from modelling.gptj.gptj_lm import GPTJLMHeadModelTP 17 | 18 | def finetuneanon_mapping_lm_tp( 19 | config: GPTJConfig, session: TaskSession, pretrained: HFLMHeadModel 20 | ) -> Dict[popxl.Tensor, np.ndarray]: 21 | weights = GPTJLMHeadModelTP.finetuneanon_mapping(config, session.state.fwd, pretrained) 22 | return weights 23 | 24 | 25 | def finetuneanon_mapping_tp( 26 | config: GPTJConfig, session: TaskSession, pretrained: HFModel 27 | ) -> Dict[popxl.Tensor, np.ndarray]: 28 | weights = GPTJModelTP.finetuneanon_mapping(config, session.state.fwd, pretrained) 29 | return weights 30 | -------------------------------------------------------------------------------- /multimodal/magma/modelling/gptj/gptj_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved.
2 | import numpy as np 3 | from typing import Dict 4 | from configs import GPTJConfig 5 | import torch 6 | 7 | import popxl 8 | from popxl.utils import to_numpy 9 | 10 | import popxl_addons as addons 11 | from popxl_addons import NamedTensors 12 | from popxl_addons.named_tensors import NamedTensorData 13 | 14 | from popxl_addons.layers import LayerNorm 15 | 16 | from .embedding import GPTJEmbeddingsTP 17 | from .decoder import GPTJDecoderTP, GPTJDecoderBlockTP 18 | 19 | from transformers.models.gpt_neo.modeling_gpt_neo import GPTNeoModel as HFModel 20 | from transformers.models.gpt_neo.configuration_gpt_neo import GPTNeoConfig as GPTJConfigHF 21 | 22 | 23 | class GPTJModelTP(addons.Module): 24 | def __init__(self, config: GPTJConfig, include_layer_norm=True): 25 | super().__init__() 26 | self.config = config 27 | # sharded, then last bit identical 28 | self.embeddings = GPTJEmbeddingsTP(self.config) 29 | # identical inputs, then sharded, then identical 30 | self.decoder = GPTJDecoderTP(self.config) 31 | # identical 32 | self.include_layer_norm = include_layer_norm 33 | if self.include_layer_norm: 34 | self.ln_f = LayerNorm() 35 | 36 | def build(self, input_ids: popxl.Tensor): 37 | x = self.embeddings(input_ids) 38 | x = self.decoder(x) 39 | if self.include_layer_norm: 40 | x = self.ln_f(x) 41 | return x 42 | 43 | @staticmethod 44 | def finetuneanon_mapping( 45 | config: GPTJConfig, variables: NamedTensors, hf_model: HFModel, layer_norm=True, from_magma: bool = True 46 | ) -> Dict[popxl.Tensor, np.ndarray]: 47 | dtype = config.dtype 48 | weights = {} 49 | if layer_norm: 50 | weights = { 51 | variables.ln_f.weight: to_numpy(hf_model.ln_f.weight.data, dtype), 52 | variables.ln_f.bias: to_numpy(hf_model.ln_f.bias.data, dtype), 53 | } 54 | 55 | weights.update(GPTJEmbeddingsTP.finetuneanon_mapping(config, variables.embeddings, hf_model)) 56 | 57 | for l in range(config.layers): 58 | weights.update( 59 | GPTJDecoderBlockTP.finetuneanon_mapping( 60 | config, variables.decoder[l], hf_model.h[l], from_magma=from_magma 61 | ) 62 | ) 63 | 64 | return weights 65 | -------------------------------------------------------------------------------- /multimodal/magma/modelling/image_prefix.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | from popxl_addons import Module, NamedTensors 3 | from popxl_addons.layers import Linear, LayerNorm 4 | import popxl 5 | from popxl import Tensor 6 | from popxl import ops 7 | from popxl.utils import to_numpy 8 | 9 | from configs import MagmaConfig 10 | from modelling.clip_resnet.modified_resnet import ModifiedResNet 11 | import numpy as np 12 | 13 | 14 | class ImagePrefix(Module): 15 | def __init__(self, config: MagmaConfig): 16 | """ 17 | Takes in a batch of images and returns a batch of embeddings of the 18 | same dimensions as the LM's word embeddings. 
19 | """ 20 | super().__init__() 21 | self.config = config 22 | proj_out_dim = self.config.transformer.hidden_size 23 | # project to the language model hidden_dim 24 | self.proj = Linear(proj_out_dim) 25 | self.enc = ModifiedResNet(config.visual, pool=False) 26 | self.ln = LayerNorm() 27 | 28 | def build(self, x: popxl.Tensor) -> popxl.Tensor: 29 | # pass through image encoder 30 | #: (b, channels, h, w) 31 | embed = self.enc(x) 32 | #: b h w d 33 | assert len(embed.shape) == 4 34 | if embed.shape[1] == 1: 35 | embed = ops.squeeze(embed, [1, 2]) 36 | else: 37 | #: b (h w) d 38 | embed = embed.reshape((*embed.shape[:2], embed.shape[2] * embed.shape[3])).transpose((0, 2, 1)) 39 | #: b (h w) d -> b ( h w ) proj_out_dim = b ( h w ) lm_hidden_size 40 | embed = self.proj(embed) 41 | bs, hw, hidden = embed.shape 42 | embed = embed.reshape((bs * hw, hidden)) 43 | embed = self.ln(embed) 44 | embed = embed.reshape((bs, hw, hidden)) 45 | return embed 46 | 47 | @staticmethod 48 | def magma_mapping(magma_model, config, variables: NamedTensors): 49 | state_dict = ModifiedResNet.clip_mapping(magma_model.enc, config.visual, variables.enc, False) 50 | state_dict.update( 51 | { 52 | variables.ln.weight: to_numpy(magma_model.ln.weight), 53 | variables.ln.bias: to_numpy(magma_model.ln.bias), 54 | variables.proj.weight: to_numpy(magma_model.proj.weight.T), 55 | variables.proj.bias: to_numpy(magma_model.proj.bias), 56 | } 57 | ) 58 | return state_dict 59 | -------------------------------------------------------------------------------- /multimodal/magma/modelling/magma_mapping.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | from typing import Dict 4 | import numpy as np 5 | from torch import nn 6 | 7 | from transformers.models.gpt_neo.configuration_gpt_neo import GPTNeoConfig 8 | 9 | import popxl 10 | from popxl_addons import TaskSession 11 | 12 | from configs import MagmaConfig, GPTJConfig, CONFIG_DIR 13 | from modelling.image_prefix import ImagePrefix 14 | from modelling.gptj.gptj_lm import GPTJLMHeadModelTP 15 | 16 | from magma.image_encoders import clip_encoder 17 | from magma.magma import Magma 18 | import os 19 | 20 | 21 | def load_magma(path, config: MagmaConfig, check_config: bool = True) -> nn.Module: 22 | """ 23 | Loads magma checkpoint. 24 | """ 25 | model = Magma.from_checkpoint( 26 | config_path=os.path.join(CONFIG_DIR, "MAGMA_v1.yml"), 27 | checkpoint_path=path, 28 | device="cpu", 29 | ) 30 | if config.visual.precision == "float16": 31 | model.image_prefix.half() 32 | if config.transformer.precision == "float16": 33 | model.lm.half() 34 | 35 | if check_config: 36 | finetuneanon_lm_config_check(config.transformer, model.lm.config) 37 | 38 | return model 39 | 40 | 41 | def magma_mapping(config: MagmaConfig, session: TaskSession, magma: nn.Module) -> Dict[popxl.Tensor, np.ndarray]: 42 | weights = ImagePrefix.magma_mapping(magma.image_prefix, config, session.state.fwd.image_prefix) 43 | weights.update(GPTJLMHeadModelTP.finetuneanon_mapping(config.transformer, session.state.fwd, magma.lm)) 44 | return weights 45 | 46 | 47 | def finetuneanon_lm_config_check(config: GPTJConfig, finetuneanon_config: GPTNeoConfig): 48 | """ 49 | Compare a GPTJConfig with a finetuneanon GPTNeoConfig config and ensure they match. 
50 | Required if loading a pre-trained model 51 | """ 52 | if finetuneanon_config.jax == False: 53 | raise ValueError( 54 | "GPTNeo model in https://github.com/finetuneanon/transformers is equivalent to gptj only with jax=True" 55 | ) 56 | if finetuneanon_config.rotary == False: 57 | raise ValueError( 58 | "GPTNeo model in https://github.com/finetuneanon/transformers is equivalent to gptj only if rotary embedding is used" 59 | ) 60 | for attn in finetuneanon_config.attention_layers: 61 | if attn != "global": 62 | raise ValueError( 63 | 'GPTNeo model in https://github.com/finetuneanon/transformers is equivalent to gptj only if "global" attention is used' 64 | ) 65 | attn_type = finetuneanon_config.attention_types[0][0] 66 | if attn_type != "global": 67 | raise ValueError( 68 | 'GPTNeo model in https://github.com/finetuneanon/transformers is equivalent to gptj only if "global" attention is used' 69 | ) 70 | 71 | params = [ 72 | ("hidden_size", config.hidden_size, finetuneanon_config.hidden_size), 73 | ("heads", config.attention.heads, finetuneanon_config.num_heads), 74 | ("layers", config.layers, finetuneanon_config.num_layers), 75 | ("vocab_size", config.embedding.real_vocab_size, finetuneanon_config.vocab_size), 76 | ("rotary_dim", config.attention.rotary_dim, finetuneanon_config.rotary_dim), 77 | ] 78 | 79 | if not all(xl == hf for _, xl, hf in params): 80 | not_eq_str = ", ".join(f"\n`{name}` not equal, config: {xl}, hf: {hf}" for name, xl, hf in params if xl != hf) 81 | raise ValueError( 82 | f"Config does not match the GPTNeo pre-trained model from https://github.com/finetuneanon/transformers. Not matching: {not_eq_str}" 83 | ) 84 | -------------------------------------------------------------------------------- /multimodal/magma/requirements.txt: -------------------------------------------------------------------------------- 1 | --find-links https://download.pytorch.org/whl/torch_stable.html 2 | 3 | numpy 4 | torch==2.0.1+cpu 5 | #examples-utils[common] @ git+https://github.com/graphcore/examples-utils.git@7cd37a8eccabe88e3741eef2c31bafd4fcd30c4c 6 | examples-utils[common] @ git+https://github.com/graphcore/examples-utils.git@v3.3 7 | graphcore-cloud-tools[logger] @ git+https://github.com/graphcore/graphcore-cloud-tools@v0.1 8 | pyyaml==5.4.1 9 | dataclasses 10 | typeguard==2.13.3 11 | scipy~=1.10.1 12 | 13 | pytest==6.2.5 14 | pytest-pythonpath==0.7.4 15 | 16 | jupyter 17 | ipywidgets 18 | 19 | git+https://github.com/graphcore/popxl-addons.git@sdk-release-3.3.0 20 | git+https://github.com/Aleph-Alpha/magma@4d01e5172115ab4a8f4b4bf8da76dbc08b6cf36c 21 | -------------------------------------------------------------------------------- /multimodal/magma/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | from .simple_parsing_tools import * 3 | -------------------------------------------------------------------------------- /multimodal/magma/utils/sampling.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
2 | # Copyright (c) 2022 Aleph Alpha GmbH 3 | 4 | from magma.sampling import top_k_filter, top_p_filter 5 | import torch 6 | import torch.nn.functional as F 7 | 8 | 9 | def generate(logits: torch.Tensor, top_k: float = 0.0, top_p: float = 0.9, temperature: float = 0.7): 10 | # taken from https://github.com/Aleph-Alpha/magma/blob/master/magma/sampling.py 11 | if temperature == 0.0: 12 | next_token = torch.argmax(logits, dim=-1, keepdims=True) 13 | else: 14 | if top_k > 0: 15 | logits = top_k_filter(logits, k=top_k) 16 | if top_p > 0: 17 | logits = top_p_filter(logits, threshold=top_p) 18 | 19 | probs = F.softmax(logits / temperature, dim=-1) 20 | next_token = torch.multinomial(probs, num_samples=1) 21 | return next_token 22 | -------------------------------------------------------------------------------- /multimodal/magma/utils/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | import argparse 4 | import atexit 5 | import logging 6 | import os 7 | import random 8 | import tempfile 9 | from argparse import ArgumentParser 10 | from pathlib import Path 11 | from typing import Optional, Callable, Tuple, Union, List 12 | 13 | import numpy as np 14 | import popart 15 | import torch 16 | 17 | from popxl_addons import GIT_COMMIT as ADDONS_GIT_COMMIT 18 | from popxl_addons.utils import timer 19 | 20 | from configs import MagmaConfig 21 | from utils.simple_parsing_tools import parse_args_with_presets 22 | import popdist 23 | import sys 24 | 25 | __all__ = ["set_random_seeds", "magma_config_setup"] 26 | 27 | 28 | def set_random_seeds(seed: int) -> None: 29 | """ 30 | Initialise seeds on host (numpy, torch, random) 31 | to guarantee deterministic results 32 | """ 33 | np.random.seed(seed) 34 | torch.manual_seed(seed) 35 | random.seed(seed) 36 | 37 | 38 | def magma_config_setup( 39 | config_file: Union[str, Path], 40 | presets_key: str, 41 | default: str, 42 | CLI_args: Optional[str] = None, 43 | ) -> Tuple[MagmaConfig, argparse.Namespace]: 44 | """Parse command line args and set up the random seed and logging. 45 | Args: 46 | config_file: Path to config file (yaml) 47 | presets_key: Which key in the config to use 48 | default: Default model config 49 | CLI_args: Extra command line arguments to customise configuration 50 | 51 | Returns: 52 | MagmaConfig and argparse namespace 53 | """ 54 | 55 | def custom_args(parser: ArgumentParser): 56 | log_level = os.environ.get("APP_LOG_LEVEL", "INFO") 57 | parser.add_argument( 58 | "--log_level", 59 | choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], 60 | type=str, 61 | default=log_level, 62 | help=("Logging level for the app. " "Can also be set using the environment variable `APP_LOG_LEVEL`"), 63 | ) 64 | # needed for jupyter notebooks 65 | parser.add_argument("-f", type=str, default="", help="jupyter") 66 | 67 | config, args = parse_args_with_presets(MagmaConfig, config_file, presets_key, default, custom_args, CLI_args) 68 | config: MagmaConfig # type: ignore 69 | config.validate() 70 | 71 | set_random_seeds(config.seed) 72 | 73 | logging_setup(args, config) 74 | 75 | return config, args 76 | 77 | 78 | def logging_setup(args, config): 79 | """Setup logging""" 80 | logging.basicConfig( 81 | level=args.log_level, 82 | format="%(asctime)s %(levelname)s: %(message)s", 83 | datefmt="%Y-%m-%d %H:%M:%S", 84 | stream=sys.stdout, 85 | ) 86 | logging.info(f"Starting.
Process id: {os.getpid()}") 87 | logging.info(f"Config: {config}") 88 | -------------------------------------------------------------------------------- /multimodal/magma/utils/simple_parsing_tools.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | from examples_utils.parsing.simple_parsing_tools import * 4 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/api.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | from utils.trainer import T5Trainer 4 | from utils.pipeline import T5Pipeline 5 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | from .config import T5Config, Execution 3 | import os 4 | from pathlib import Path 5 | 6 | CONFIG_DIR = Path(os.path.dirname(__file__)) 7 | 8 | del os, Path 9 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/config/finetuning.yml: -------------------------------------------------------------------------------- 1 | # -------- Models -------- 2 | xxl: &xxl 3 | model: 4 | sequence_length: 512 5 | embedding: 6 | vocab_size: 32128 7 | hidden_size: 4096 8 | d_ff: 10240 9 | layers: 24 10 | attention: 11 | heads: 64 12 | d_kv: 64 13 | training: 14 | global_batch_size: 128 15 | steps: 500 16 | optimizer: 17 | name: adamw 18 | learning_rate: 19 | maximum: 5e-6 20 | warmup_steps: 10 21 | weight_decay: 0.0 22 | 23 | xl: &xl 24 | model: 25 | sequence_length: 512 26 | embedding: 27 | vocab_size: 32128 28 | hidden_size: 2048 29 | d_ff: 5120 30 | layers: 24 31 | attention: 32 | heads: 32 33 | d_kv: 64 34 | training: 35 | global_batch_size: 128 36 | steps: 500 37 | optimizer: 38 | name: adamw 39 | learning_rate: 40 | maximum: 5e-6 41 | warmup_steps: 10 42 | weight_decay: 0.01 43 | 44 | tiny: &tiny 45 | model: 46 | sequence_length: 512 47 | embedding: 48 | vocab_size: 128 49 | hidden_size: 64 50 | d_ff: 256 51 | layers: 4 52 | attention: 53 | heads: 4 54 | d_kv: 16 55 | training: 56 | global_batch_size: 16 57 | steps: 10 58 | optimizer: 59 | name: adamw 60 | learning_rate: 61 | maximum: 1e-5 62 | warmup_steps: 0 63 | weight_decay: 0.01 64 | 65 | # ------------------------- 66 | 67 | 68 | # ------- Execution ------- 69 | release: 70 | xxl_pod64: 71 | <<: *xxl 72 | execution: 73 | io_tiles: 128 74 | micro_batch_size: 1 75 | loss_scaling: 1 76 | data_parallel: 4 77 | tensor_parallel: 16 78 | available_memory_proportion: [ 0.2 ] 79 | 80 | xxl_pod16: 81 | <<: *xxl 82 | execution: 83 | io_tiles: 128 84 | micro_batch_size: 1 85 | loss_scaling: 1 86 | data_parallel: 1 87 | tensor_parallel: 16 88 | available_memory_proportion: [ 0.2 ] 89 | 90 | xl_pod16: 91 | <<: *xl 92 | execution: 93 | io_tiles: 128 94 | micro_batch_size: 1 95 | loss_scaling: 1 96 | data_parallel: 2 97 | tensor_parallel: 8 98 | available_memory_proportion: [ 0.2 ] 99 | 100 | xl_pod8: 101 | <<: *xl 102 | execution: 103 | io_tiles: 128 104 | micro_batch_size: 1 105 | loss_scaling: 1 106 | data_parallel: 1 107 | tensor_parallel: 8 108 | available_memory_proportion: [ 0.2 ] 109 | 110 | tiny: 111 | 
<<: *tiny 112 | execution: 113 | io_tiles: 64 114 | micro_batch_size: 1 115 | data_parallel: 2 116 | tensor_parallel: 2 117 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/config/inference.yml: -------------------------------------------------------------------------------- 1 | # -------- Models -------- 2 | xxl: &xxl 3 | model: 4 | eval: true 5 | sequence_length: 512 6 | embedding: 7 | vocab_size: 32128 8 | hidden_size: 4096 9 | d_ff: 10240 10 | layers: 24 11 | attention: 12 | heads: 64 13 | d_kv: 64 14 | 15 | xl: &xl 16 | model: 17 | eval: true 18 | sequence_length: 512 19 | embedding: 20 | vocab_size: 32128 21 | hidden_size: 2048 22 | d_ff: 5120 23 | layers: 24 24 | attention: 25 | heads: 32 26 | d_kv: 64 27 | 28 | tiny: &tiny 29 | model: 30 | eval: true 31 | sequence_length: 512 32 | embedding: 33 | vocab_size: 128 34 | hidden_size: 64 35 | d_ff: 256 36 | layers: 4 37 | attention: 38 | heads: 4 39 | d_kv: 16 40 | 41 | # ------------------------- 42 | 43 | # ------- Execution ------- 44 | release: 45 | xxl-mnli: 46 | <<: *xxl 47 | execution: 48 | micro_batch_size: 20 49 | available_memory_proportion: [ 0.4 ] 50 | tensor_parallel: 16 51 | 52 | xxl: 53 | <<: *xxl 54 | execution: 55 | micro_batch_size: 12 56 | available_memory_proportion: [ 0.4 ] 57 | tensor_parallel: 16 58 | 59 | xl-mnli: 60 | <<: *xl 61 | execution: 62 | micro_batch_size: 24 63 | available_memory_proportion: [ 0.4 ] 64 | tensor_parallel: 8 65 | 66 | xl: 67 | <<: *xl 68 | execution: 69 | micro_batch_size: 16 70 | available_memory_proportion: [ 0.4 ] 71 | tensor_parallel: 8 72 | 73 | tiny: 74 | <<: *tiny 75 | execution: 76 | micro_batch_size: 2 77 | available_memory_proportion: [ 0.4 ] 78 | tensor_parallel: 2 79 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/graphs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/imgs/mnli_dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/doing-more-with-flan-t5/imgs/mnli_dataset.png -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/modelling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/modelling/hf_mapping.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
2 | from typing import Dict 3 | import numpy as np 4 | 5 | from transformers.models.t5 import T5Model as HFModel 6 | from transformers.models.t5 import T5ForConditionalGeneration as HFLMHeadModel 7 | 8 | import popxl 9 | from popxl_addons import TaskSession 10 | 11 | from config import T5Config 12 | from modelling.t5_model import T5ModelTP 13 | from modelling.t5_lm import T5LMHeadModelTP 14 | 15 | 16 | def hf_mapping_lm_tp( 17 | config: T5Config, session: TaskSession, pretrained: HFLMHeadModel 18 | ) -> Dict[popxl.Tensor, np.ndarray]: 19 | load_to = session.state 20 | if "fwd" in session.state: 21 | load_to = session.state.fwd 22 | weights = T5LMHeadModelTP.hf_mapping(config, load_to, pretrained) 23 | return weights 24 | 25 | 26 | def hf_mapping_TP(config: T5Config, session: TaskSession, pretrained: HFModel) -> Dict[popxl.Tensor, np.ndarray]: 27 | load_to = session.state 28 | if "fwd" in session.state: 29 | load_to = session.state.fwd 30 | weights = T5ModelTP.hf_mapping(config, load_to, pretrained) 31 | return weights 32 | 33 | 34 | def load_lm_to_hf(session: TaskSession, hf_model: HFLMHeadModel) -> HFLMHeadModel: 35 | weights = session.get_named_tensors_data() 36 | if "fwd" in weights: 37 | weights = weights.fwd 38 | state_dict = T5LMHeadModelTP.to_hf(weights, hf_model) 39 | hf_model.load_state_dict(state_dict) 40 | return hf_model 41 | 42 | 43 | def load_to_hf(session: TaskSession, hf_model: HFModel) -> HFModel: 44 | weights = session.get_named_tensors_data() 45 | if "fwd" in weights: 46 | weights = weights.fwd 47 | state_dict = T5ModelTP.to_hf(weights, hf_model) 48 | hf_model.load_state_dict(state_dict) 49 | return hf_model 50 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/modelling/layer_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | from functools import partial 3 | from typing import Dict 4 | import torch 5 | import popxl 6 | from popxl import ops 7 | from popxl.utils import to_numpy 8 | from transformers.models.t5.modeling_t5 import T5LayerNorm as HFModel 9 | from transformers.models.t5.configuration_t5 import T5Config as T5ConfigHF 10 | 11 | import popxl_addons as addons 12 | from config import T5Config 13 | from popxl_addons.named_tensors import NamedTensorData 14 | import numpy as np 15 | 16 | from popxl_addons.named_tensors import NamedTensors 17 | 18 | 19 | class T5LayerNorm(addons.Module): 20 | def __init__(self, config: T5Config): 21 | super().__init__() 22 | self.eps = config.model.eps 23 | self.dtype = config.model.dtype 24 | 25 | def build(self, x: popxl.Tensor) -> popxl.Tensor: 26 | """ 27 | Build layer normalisation for T5. No bias and no subtraction of mean. 
28 | """ 29 | w = self.add_variable_input("weight", partial(np.ones, x.shape[-1]), self.dtype) 30 | 31 | # Perform the computation in float32 32 | if x.dtype == popxl.float16: 33 | x = ops.cast(x, popxl.float32) 34 | variance = ops.mean(x * x, -1, keepdims=True) 35 | x = x / ops.sqrt(variance + self.eps) 36 | 37 | # Cast back down to float16 if needed 38 | if w.dtype == popxl.float16: 39 | x = ops.cast(x, popxl.float16) 40 | 41 | x = x * w 42 | return x 43 | 44 | @staticmethod 45 | def hf_mapping(config: T5Config, variables: NamedTensors, hf_model: HFModel) -> Dict[popxl.Tensor, np.ndarray]: 46 | dtype = config.model.dtype 47 | weights = { 48 | variables.weight: to_numpy(hf_model.weight.data, dtype), 49 | } 50 | return weights 51 | 52 | @staticmethod 53 | def to_hf(config: T5ConfigHF, popxl_state_dict: NamedTensorData, hf_model: HFModel) -> Dict[str, torch.Tensor]: 54 | state_dict = {} 55 | state_dict["weight"] = torch.tensor(popxl_state_dict.weight, dtype=config.torch_dtype) 56 | return state_dict 57 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest==6.2.5 2 | pytest-pythonpath==0.7.4 3 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/requirements.txt: -------------------------------------------------------------------------------- 1 | --find-links https://download.pytorch.org/whl/cpu/torch_stable.html 2 | 3 | graphcore-cloud-tools[logger] @ git+https://github.com/graphcore/graphcore-cloud-tools@v0.3 4 | examples-utils[common] @ git+https://github.com/graphcore/examples-utils.git@latest_stable 5 | pyyaml==5.4.1 6 | dataclasses==0.8; python_version < '3.7' 7 | transformers==4.25.1 8 | datasets 9 | evaluate==0.4.0 10 | tfrecord==1.14.1 11 | torch==2.0.1+cpu 12 | numpy 13 | scipy>=1.5.4 14 | more-itertools==8.13.0 15 | wandb==0.12.8 16 | scikit-learn 17 | 18 | git+https://github.com/graphcore/popxl-addons.git@sdk-release-3.3.0 19 | 20 | protobuf==3.20.*; python_version > '3.6' 21 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | from .simple_parsing_tools import * 3 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/utils/simple_parsing_tools.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | from examples_utils.parsing.simple_parsing_tools import * 3 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/utils/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
2 | from typing import Dict 3 | 4 | import numpy as np 5 | import time 6 | 7 | 8 | def _linear_schedule(start: int, end: int, interval: int, low: float, high: float) -> Dict[int, float]: 9 | update_steps = np.arange(start, end + 1, interval).astype(np.uint32) 10 | updates = np.linspace(low, high, len(update_steps)) 11 | return dict(zip(update_steps, updates)) 12 | 13 | 14 | def warmup_schedule(total_steps: int, minimum: float, maximum: float, warmup_steps: int = 0) -> Dict[int, float]: 15 | """Learning rate schedule with linear warm up and then remains at max. 16 | 17 | Linearly increase from `minimum` to `maximum` for `warmup_steps` steps. 18 | Then constant at the `maximum` learning rate for the remaining steps. 19 | 20 | Returns a dict that maps step to learning rate. 21 | """ 22 | schedule = {} 23 | if warmup_steps > 0: 24 | schedule.update(_linear_schedule(0, warmup_steps, 1, minimum, maximum)) 25 | 26 | schedule.update(_linear_schedule(warmup_steps, total_steps, 1, maximum, maximum)) # maximum to maximum so constant 27 | return schedule 28 | 29 | 30 | class SimpleTimer: 31 | def __init__(self): 32 | self._start = None 33 | 34 | def start(self): 35 | self._start = time.perf_counter() 36 | 37 | def stop(self): 38 | self.elapsed = time.perf_counter() - self._start 39 | self._start = None 40 | 41 | def __enter__(self): 42 | self.start() 43 | return self 44 | 45 | def __exit__(self, *exc_args): 46 | self.stop() 47 | -------------------------------------------------------------------------------- /natural-language-processing/images/bert-pipelining.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/bert-pipelining.png -------------------------------------------------------------------------------- /natural-language-processing/images/bert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/bert.png -------------------------------------------------------------------------------- /natural-language-processing/images/causal_language_modeling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/causal_language_modeling.png -------------------------------------------------------------------------------- /natural-language-processing/images/masked_language_modeling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/masked_language_modeling.png -------------------------------------------------------------------------------- /natural-language-processing/images/name_entity_extraction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/name_entity_extraction.png -------------------------------------------------------------------------------- /natural-language-processing/images/partitioning.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/partitioning.jpg -------------------------------------------------------------------------------- /natural-language-processing/images/pipelining.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/pipelining.png -------------------------------------------------------------------------------- /natural-language-processing/images/question_answering.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/question_answering.png -------------------------------------------------------------------------------- /natural-language-processing/images/recomputation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/recomputation.png -------------------------------------------------------------------------------- /natural-language-processing/images/rts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/rts.png -------------------------------------------------------------------------------- /natural-language-processing/images/squad.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/squad.png -------------------------------------------------------------------------------- /natural-language-processing/images/summarization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/summarization.png -------------------------------------------------------------------------------- /natural-language-processing/images/t5_vs_flan_t5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/t5_vs_flan_t5.png -------------------------------------------------------------------------------- /natural-language-processing/images/text_classification.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/text_classification.png -------------------------------------------------------------------------------- /natural-language-processing/images/token_classification.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/token_classification.png -------------------------------------------------------------------------------- /natural-language-processing/images/translation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/translation.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/bert-pipelining.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/bert-pipelining.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/bert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/bert.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/causal_language_modeling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/causal_language_modeling.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/masked_language_modeling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/masked_language_modeling.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/mt5_oom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/mt5_oom.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/name_entity_extraction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/name_entity_extraction.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/partitioning.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/partitioning.jpg -------------------------------------------------------------------------------- 
/natural-language-processing/other-use-cases/images/pipelining.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/pipelining.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/question_answering.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/question_answering.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/recomputation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/recomputation.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/restart_kernel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/restart_kernel.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/rts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/rts.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/squad.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/squad.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/summarization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/summarization.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/text_classification.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/text_classification.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/token_classification.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/token_classification.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/translation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/translation.png -------------------------------------------------------------------------------- /packed-bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/packed-bert/__init__.py -------------------------------------------------------------------------------- /packed-bert/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/packed-bert/models/__init__.py -------------------------------------------------------------------------------- /packed-bert/pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/packed-bert/pipeline/__init__.py -------------------------------------------------------------------------------- /packed-bert/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/packed-bert/utils/__init__.py -------------------------------------------------------------------------------- /packed-bert/utils/packing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/packed-bert/utils/packing/__init__.py -------------------------------------------------------------------------------- /stable-diffusion/requirements.txt: -------------------------------------------------------------------------------- 1 | optimum-graphcore==0.7 2 | matplotlib 3 | graphcore-cloud-tools[logger] @ git+https://github.com/graphcore/graphcore-cloud-tools@v0.3 4 | -------------------------------------------------------------------------------- /stable-diffusion/sample_images/image_to_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/stable-diffusion/sample_images/image_to_image.png -------------------------------------------------------------------------------- /stable-diffusion/sample_images/inpainting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/stable-diffusion/sample_images/inpainting.png -------------------------------------------------------------------------------- /stable-diffusion/sample_images/text_to_image.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/stable-diffusion/sample_images/text_to_image.png -------------------------------------------------------------------------------- /stable-diffusion/sample_images/text_to_image_sd2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/stable-diffusion/sample_images/text_to_image_sd2.png -------------------------------------------------------------------------------- /useful-tips/images/connect-tunnel-from-web-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/useful-tips/images/connect-tunnel-from-web-1.png -------------------------------------------------------------------------------- /useful-tips/images/connect-tunnel-from-web-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/useful-tips/images/connect-tunnel-from-web-2.png -------------------------------------------------------------------------------- /useful-tips/images/connect-tunnel-to-app-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/useful-tips/images/connect-tunnel-to-app-1.png -------------------------------------------------------------------------------- /useful-tips/images/connect-tunnel-to-app-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/useful-tips/images/connect-tunnel-to-app-2.png -------------------------------------------------------------------------------- /useful-tips/images/login-code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/useful-tips/images/login-code.png -------------------------------------------------------------------------------- /useful-tips/images/login-success.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/useful-tips/images/login-success.png -------------------------------------------------------------------------------- /useful-tips/images/restart_kernel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/useful-tips/images/restart_kernel.png -------------------------------------------------------------------------------- /useful-tips/images/tunnel-unregister.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/useful-tips/images/tunnel-unregister.png --------------------------------------------------------------------------------