├── .github ├── deployment-configs │ ├── deploy-deberta.yaml │ ├── deploy-flan-t5.yaml │ ├── deploy-gpt-j-pod-4.yaml │ ├── deploy-huggingface-paperspace.yaml │ ├── deploy-licenses.yaml │ ├── deploy-magma.yaml │ ├── deploy-optimum-7.1.yaml │ └── deploy-whisper.yaml ├── pull_request_template.md ├── test_configs │ └── image-config.yaml └── workflows │ ├── 1-static-checks.yml │ ├── 2-local-vpod-tests.yml │ ├── 3-probe-on-ps.yml │ ├── configs │ ├── pre-commit-config.yaml │ ├── pylint.rc │ └── ruff.toml │ ├── copy-notebooks-from-source.yml │ └── sync-repos.yml ├── .gradient ├── available_ipus.py ├── check_tier.py ├── notebook-tests.yaml ├── prepare-datasets.sh ├── settings.yaml └── symlink_config.json ├── LICENSE ├── README.md ├── README_first.ipynb ├── audio-processing ├── requirements.txt ├── wav2vec2-fine-tuning-checkpoint.ipynb └── wav2vec2-inference-checkpoint.ipynb ├── dolly2-instruction-following ├── Dolly2-an-OSS-instruction-LLM.ipynb ├── OpenAssistant-Pythia-12B-Chatbot.ipynb ├── api │ ├── __init__.py │ └── pipeline.py ├── config │ ├── __init__.py │ ├── config.py │ └── inference.yml ├── inference.py ├── modelling │ ├── __init__.py │ ├── attention.py │ ├── decoder.py │ ├── dolly_lm.py │ ├── dolly_model.py │ ├── embedding.py │ ├── feed_forward.py │ ├── hf_mapping.py │ └── rotary_pos_embed │ │ ├── __init__.py │ │ ├── common.hpp │ │ ├── rotary_pos_embed.cpp │ │ ├── rotary_pos_embed.hpp │ │ ├── rotary_pos_embed.py │ │ ├── rotary_pos_embed_binding.cpp │ │ ├── rotary_pos_embedx.cpp │ │ └── rotary_pos_embedx.hpp ├── requirements.txt ├── tests │ ├── conftest.py │ ├── integration │ │ ├── execution │ │ │ └── test_execution.py │ │ └── layers │ │ │ ├── test_attention_TP.py │ │ │ ├── test_decoder_block_TP.py │ │ │ ├── test_feed_forward_TP.py │ │ │ ├── test_lm_TP.py │ │ │ └── test_model_TP.py │ └── test_config.yml └── utils │ ├── __init__.py │ ├── setup.py │ └── simple_parsing_tools.py ├── gptj-text-generation ├── GPTJ-generative-inference.ipynb ├── GPTJ-group-quantized.ipynb ├── api.py ├── config │ ├── __init__.py │ ├── config.py │ ├── finetuning.yml │ └── inference.yml ├── data │ ├── __init__.py │ ├── data_utils.py │ ├── hf_data_utils.py │ └── mnli_data.py ├── finetuning.ipynb ├── finetuning.py ├── imgs │ ├── bs_buffers.png │ ├── data_parallelism.png │ ├── dp_tp.png │ ├── execution.jpg │ ├── gq-speed-accuracy-tradeoff.png │ ├── mnli_dataset.png │ ├── rts.png │ ├── tensor_parallelism.png │ ├── tp.jpg │ └── tp_dp_rts.png ├── inference.py ├── modelling │ ├── __init__.py │ ├── attention.py │ ├── decoder.py │ ├── embedding.py │ ├── feed_forward.py │ ├── gptj_lm.py │ ├── gptj_model.py │ └── hf_mapping.py ├── pytest.ini ├── requirements-dev.txt ├── requirements.txt ├── run_finetuning.py ├── run_inference.py ├── run_validation.py ├── tests │ ├── conftest.py │ ├── integration │ │ ├── execution │ │ │ ├── test_execution.py │ │ │ └── test_overfitting.py │ │ └── layers │ │ │ ├── test_attention_TP.py │ │ │ ├── test_decoder_block_TP.py │ │ │ ├── test_feed_forward_TP.py │ │ │ ├── test_gptj_TP.py │ │ │ └── test_lm_TP.py │ ├── test_config.yml │ └── unit │ │ └── test_dataloder.py ├── tests_serial │ ├── dataloader_checkpoints.py │ ├── distributed_sampler.py │ └── test_distributed_data.py └── utils │ ├── __init__.py │ ├── inference.py │ ├── pipeline.py │ ├── setup.py │ ├── simple_parsing_tools.py │ ├── trainer.py │ └── utils.py ├── image-classification ├── LICENSE └── image_classification.ipynb ├── images ├── folder_logo.png ├── go_emotions.png └── jupyter_logo.png ├── llama2-chatbot ├── .gitignore ├── LICENSE ├── api │ 
├── __init__.py │ └── pipeline.py ├── config │ ├── __init__.py │ ├── config.py │ └── inference.yml ├── inference.py ├── llama2-inference.ipynb ├── modelling │ ├── __init__.py │ ├── attention.py │ ├── decoder.py │ ├── embedding.py │ ├── feed_forward.py │ ├── hf_mapping.py │ ├── llama_lm.py │ ├── llama_model.py │ ├── rms_norm.py │ └── rotary_pos_embed │ │ ├── .rendered.rotary_pos_embed_binding.cpp │ │ ├── __init__.py │ │ ├── common.hpp │ │ ├── rotary_pos_embed.cpp │ │ ├── rotary_pos_embed.hpp │ │ ├── rotary_pos_embed.py │ │ ├── rotary_pos_embed_binding.cpp │ │ ├── rotary_pos_embed_binding.cpython-38-x86_64-linux-gnu.so │ │ ├── rotary_pos_embedx.cpp │ │ └── rotary_pos_embedx.hpp ├── pytest.ini ├── requirements.txt ├── run-inference.py ├── tests │ ├── conftest.py │ ├── integration │ │ ├── execution │ │ │ └── test_execution.py │ │ └── layers │ │ │ ├── test_attention_TP.py │ │ │ ├── test_decoder_block_TP.py │ │ │ ├── test_feed_forward_TP.py │ │ │ ├── test_lm_TP.py │ │ │ └── test_model_TP.py │ └── test_config.yml └── utils │ ├── __init__.py │ ├── setup.py │ └── simple_parsing_tools.py ├── molfeat ├── requirements.txt ├── transformers_molfeat_finetune.ipynb └── utils.py ├── multimodal └── magma │ ├── Image-description-using-MAGMA.ipynb │ ├── configs │ ├── MAGMA_v1.yml │ ├── __init__.py │ ├── config.py │ └── inference.yml │ ├── demo_example_images │ ├── cantaloupe_popsicle.jpg │ ├── circles.jpg │ ├── circles_square.jpg │ ├── korea.jpg │ ├── matterhorn.jpg │ ├── mushroom.jpg │ ├── people.jpg │ ├── playarea.jpg │ ├── popsicle.png │ ├── rainbow_popsicle.jpeg │ └── table_tennis.jpg │ ├── images │ ├── MagmaStructure.png │ └── demo_magma.png │ ├── inference.py │ ├── modelling │ ├── __init__.py │ ├── adapters_TP.py │ ├── clip_resnet │ │ ├── __init__.py │ │ ├── attention_pool.py │ │ ├── batch_norm.py │ │ ├── bottleneck.py │ │ ├── modified_resnet.py │ │ └── stem.py │ ├── gptj │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── decoder.py │ │ ├── embedding.py │ │ ├── feed_forward.py │ │ ├── finetuneanon_mapping.py │ │ ├── gptj_lm.py │ │ └── gptj_model.py │ ├── image_prefix.py │ └── magma_mapping.py │ ├── requirements.txt │ ├── run_inference.py │ └── utils │ ├── __init__.py │ ├── sampling.py │ ├── setup.py │ └── simple_parsing_tools.py ├── natural-language-processing ├── Flan-T5-generative-inference.ipynb ├── LICENSE ├── doing-more-with-flan-t5 │ ├── Flan-T5-generative-inference.ipynb │ ├── Flan-T5-textual-entailment-fine-tuning.ipynb │ ├── api.py │ ├── config │ │ ├── __init__.py │ │ ├── config.py │ │ ├── finetuning.yml │ │ └── inference.yml │ ├── data │ │ ├── __init__.py │ │ ├── data_utils.py │ │ └── mnli_data.py │ ├── finetuning.py │ ├── graphs │ │ ├── __init__.py │ │ ├── embedding.py │ │ ├── encoder_decoder.py │ │ ├── graphs.py │ │ └── head.py │ ├── imgs │ │ └── mnli_dataset.png │ ├── inference.py │ ├── modelling │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── embedding.py │ │ ├── encoder_decoder.py │ │ ├── feed_forward.py │ │ ├── hf_mapping.py │ │ ├── layer_norm.py │ │ ├── t5_lm.py │ │ └── t5_model.py │ ├── requirements-dev.txt │ ├── requirements.txt │ ├── run_finetuning.py │ ├── run_validation.py │ └── utils │ │ ├── __init__.py │ │ ├── inference.py │ │ ├── pipeline.py │ │ ├── setup.py │ │ ├── simple_parsing_tools.py │ │ ├── trainer.py │ │ └── utils.py ├── images │ ├── bert-pipelining.png │ ├── bert.png │ ├── causal_language_modeling.png │ ├── masked_language_modeling.png │ ├── name_entity_extraction.png │ ├── partitioning.jpg │ ├── pipelining.png │ ├── question_answering.png │ ├── recomputation.png │ ├── 
rts.png │ ├── squad.png │ ├── summarization.png │ ├── t5_vs_flan_t5.png │ ├── text_classification.png │ ├── token_classification.png │ └── translation.png ├── introduction_to_optimum_graphcore.ipynb ├── name-entity-extraction.ipynb ├── other-use-cases │ ├── deberta-blog-notebook.ipynb │ ├── external_model.ipynb │ ├── images │ │ ├── bert-pipelining.png │ │ ├── bert.png │ │ ├── causal_language_modeling.png │ │ ├── masked_language_modeling.png │ │ ├── mt5_oom.png │ │ ├── name_entity_extraction.png │ │ ├── partitioning.jpg │ │ ├── pipelining.png │ │ ├── question_answering.png │ │ ├── recomputation.png │ │ ├── restart_kernel.png │ │ ├── rts.png │ │ ├── squad.png │ │ ├── summarization.png │ │ ├── text_classification.png │ │ ├── token_classification.png │ │ └── translation.png │ ├── language_modelling_from_scratch.ipynb │ ├── mt5_translation.ipynb │ ├── mt5_xnli.ipynb │ ├── multiple_choice.ipynb │ ├── question_answering.ipynb │ ├── summarization.ipynb │ ├── text_classification.ipynb │ └── token_classification.ipynb ├── sentiment_analysis.ipynb ├── squad_preprocessing.py ├── text-embeddings-models │ ├── config.py │ └── text-embeddings-on-ipu.ipynb ├── text_summarization_BART_L_inference.ipynb └── translation.ipynb ├── packed-bert ├── LICENSE ├── __init__.py ├── models │ ├── __init__.py │ └── modeling_bert_packed.py ├── packedBERT_multi_label_text_classification.ipynb ├── packedBERT_question_answering.ipynb ├── packedBERT_single_label_text_classification.ipynb ├── pipeline │ ├── __init__.py │ └── packed_bert.py └── utils │ ├── __init__.py │ └── packing │ ├── __init__.py │ ├── algorithms.py │ ├── dataset_creator.py │ ├── dataset_templates.py │ └── qa_utils.py ├── setup.sh ├── stable-diffusion ├── LICENSE ├── image_to_image.ipynb ├── inpainting.ipynb ├── requirements.txt ├── sample_images │ ├── image_to_image.png │ ├── inpainting.png │ ├── text_to_image.png │ └── text_to_image_sd2.png ├── text_to_image.ipynb └── text_to_image_sd2.ipynb ├── useful-tips ├── images │ ├── connect-tunnel-from-web-1.png │ ├── connect-tunnel-from-web-2.png │ ├── connect-tunnel-to-app-1.png │ ├── connect-tunnel-to-app-2.png │ ├── login-code.png │ ├── login-success.png │ ├── restart_kernel.png │ └── tunnel-unregister.png ├── managing_ipu_resources.ipynb └── using_vscode_in_paperspace.ipynb └── whisper ├── LICENSE ├── whisper-example.ipynb ├── whisper-quantized-example.ipynb └── whisper_finetuning.ipynb /.github/deployment-configs/deploy-deberta.yaml: -------------------------------------------------------------------------------- 1 | _optimum_graphcore_repository: &_optimum_graphcore_repository 2 | origin: https://github.com/huggingface/optimum-graphcore.git 3 | ref: main 4 | 5 | _current_repo_in_github_actions: &_current_repo_in_github_actions 6 | origin: notebooks/ 7 | ref: null 8 | 9 | deberta-lukem: 10 | source: 11 | paths: 12 | - expression: '*' 13 | path: notebooks/deberta-blog-notebook.ipynb 14 | recursive: true 15 | repository: 16 | origin: https://github.com/huggingface/optimum-graphcore.git 17 | prefix: notebooks/ 18 | ref: main 19 | target: 20 | renames: {} 21 | repository: 22 | <<: *_current_repo_in_github_actions 23 | prefix: natural-language-processing/other-use-cases/ 24 | -------------------------------------------------------------------------------- /.github/deployment-configs/deploy-flan-t5.yaml: -------------------------------------------------------------------------------- 1 | _examples_internal_repository: &_examples_internal_repository 2 | origin: examples-internal/ 3 | ref: null 4 | 5 | 6 | 
_common_target_repository: &_common_target_repository 7 | origin: notebooks/ 8 | ref: null 9 | 10 | 11 | flan-t5: 12 | source: 13 | paths: 14 | - expression: '*' 15 | path: nlp/t5/popxl 16 | recursive: true 17 | excludes: 18 | - path: nlp/t5/popxl/pytest.ini 19 | - path: nlp/t5/popxl/.ci 20 | - path: nlp/t5/popxl/README.md 21 | - path: nlp/t5/popxl/.gitignore 22 | - path: nlp/t5/popxl/tests 23 | repository: 24 | <<: *_examples_internal_repository 25 | prefix: nlp/t5/popxl 26 | target: 27 | renames: {} 28 | repository: 29 | <<: *_common_target_repository 30 | prefix: natural-language-processing/doing-more-with-flan-t5/ 31 | -------------------------------------------------------------------------------- /.github/deployment-configs/deploy-gpt-j-pod-4.yaml: -------------------------------------------------------------------------------- 1 | 2 | _optimum_graphcore_repository: &_optimum_graphcore_repository 3 | origin: https://github.com/huggingface/optimum-graphcore.git 4 | ref: main 5 | 6 | _examples_internal_repository: &_examples_internal_repository 7 | origin: examples-internal/ 8 | ref: null 9 | 10 | 11 | _common_target_repository: &_common_target_repository 12 | origin: notebooks/ 13 | ref: null 14 | 15 | gptj: 16 | source: 17 | paths: 18 | - path: nlp/gpt_j/popxl/GPTJ-generative-inference.ipynb 19 | - path: nlp/gpt_j/popxl/config/inference.yml 20 | excludes: 21 | - path: nlp/gpt_j/popxl/README.md 22 | - path: nlp/gpt_j/popxl/.gitignore 23 | - path: nlp/gpt_j/popxl/.ci 24 | repository: 25 | <<: *_examples_internal_repository 26 | prefix: nlp/gpt_j/popxl 27 | target: 28 | renames: {} 29 | repository: 30 | <<: *_common_target_repository 31 | prefix: gptj-text-generation 32 | -------------------------------------------------------------------------------- /.github/deployment-configs/deploy-licenses.yaml: -------------------------------------------------------------------------------- 1 | 2 | _optimum_graphcore_repository: &_optimum_graphcore_repository 3 | origin: https://github.com/huggingface/optimum-graphcore.git 4 | ref: main 5 | 6 | _examples_internal_repository: &_examples_internal_repository 7 | origin: examples-internal/ 8 | ref: null 9 | 10 | 11 | _common_target_repository: &_common_target_repository 12 | origin: notebooks/ 13 | ref: null 14 | 15 | _copy_apache_license: &_copy_apache_license 16 | paths: 17 | - expression: '*' 18 | path: LICENSE 19 | recursive: true 20 | repository: 21 | <<: *_optimum_graphcore_repository 22 | prefix: '' 23 | 24 | license-audio-classification: 25 | source: 26 | <<: *_copy_apache_license 27 | target: 28 | renames: {} 29 | repository: 30 | <<: *_common_target_repository 31 | prefix: 'audio-classification' 32 | 33 | license-audio-classification: 34 | source: 35 | <<: *_copy_apache_license 36 | target: 37 | renames: {} 38 | repository: 39 | <<: *_common_target_repository 40 | prefix: 'audio-classification' 41 | 42 | 43 | license-audio-classification: 44 | source: 45 | <<: *_copy_apache_license 46 | target: 47 | renames: {} 48 | repository: 49 | <<: *_common_target_repository 50 | prefix: 'audio-classification' 51 | 52 | license-natural-language-processing: 53 | source: 54 | <<: *_copy_apache_license 55 | target: 56 | renames: {} 57 | repository: 58 | <<: *_common_target_repository 59 | prefix: 'natural-language-processing' 60 | 61 | license-image-classification: 62 | source: 63 | <<: *_copy_apache_license 64 | target: 65 | renames: {} 66 | repository: 67 | <<: *_common_target_repository 68 | prefix: 'image-classification' 69 | 70 | license-packed-bert: 
71 | source: 72 | <<: *_copy_apache_license 73 | target: 74 | renames: {} 75 | repository: 76 | <<: *_common_target_repository 77 | prefix: 'packed-bert' 78 | 79 | license-stable-diffusion: 80 | source: 81 | <<: *_copy_apache_license 82 | target: 83 | renames: {} 84 | repository: 85 | <<: *_common_target_repository 86 | prefix: 'stable-diffusion' 87 | 88 | license-whisper: 89 | source: 90 | <<: *_copy_apache_license 91 | target: 92 | renames: {} 93 | repository: 94 | <<: *_common_target_repository 95 | prefix: 'whisper' 96 | -------------------------------------------------------------------------------- /.github/deployment-configs/deploy-magma.yaml: -------------------------------------------------------------------------------- 1 | _examples_internal_repository: &_examples_internal_repository 2 | origin: examples-internal/ 3 | ref: null 4 | 5 | 6 | _common_target_repository: &_common_target_repository 7 | origin: notebooks/ 8 | ref: null 9 | 10 | 11 | flan-t5: 12 | source: 13 | paths: 14 | - expression: '*' 15 | path: multimodal/magma/popxl 16 | recursive: true 17 | excludes: 18 | - path: multimodal/magma/popxl/pytest.ini 19 | - path: multimodal/magma/popxl/.ci 20 | - path: multimodal/magma/popxl/README.md 21 | - path: multimodal/magma/popxl/.gitignore 22 | - path: multimodal/magma/popxl/tests 23 | repository: 24 | <<: *_examples_internal_repository 25 | prefix: multimodal/magma/popxl/ 26 | target: 27 | renames: {} 28 | repository: 29 | <<: *_common_target_repository 30 | prefix: multimodal/magma 31 | -------------------------------------------------------------------------------- /.github/deployment-configs/deploy-optimum-7.1.yaml: -------------------------------------------------------------------------------- 1 | 2 | _optimum_graphcore_repository: &_optimum_graphcore_repository 3 | origin: https://github.com/huggingface/optimum-graphcore.git 4 | ref: main 5 | 6 | _examples_internal_repository: &_examples_internal_repository 7 | origin: examples-internal/ 8 | ref: null 9 | 10 | 11 | _common_target_repository: &_common_target_repository 12 | origin: notebooks/ 13 | ref: null 14 | 15 | whisper_quantized: 16 | source: 17 | paths: 18 | - expression: '*' 19 | path: notebooks/whisper-quantized-example.ipynb 20 | recursive: true 21 | repository: 22 | <<: *_optimum_graphcore_repository 23 | prefix: notebooks/ 24 | target: 25 | renames: {} 26 | repository: 27 | <<: *_common_target_repository 28 | prefix: whisper/ 29 | 30 | natural-language-processing-main: 31 | source: 32 | paths: 33 | - path: notebooks/text_summarization.ipynb 34 | repository: 35 | <<: *_optimum_graphcore_repository 36 | prefix: notebooks/ 37 | target: 38 | renames: {natural-language-processing/text_summarization.ipynb: natural-language-processing/text_summarization_BART_L_inference.ipynb} 39 | repository: 40 | <<: *_common_target_repository 41 | prefix: natural-language-processing/ 42 | 43 | text_embeddings_models: 44 | source: 45 | paths: 46 | - expression: '*' 47 | path: notebooks/text_embeddings_models/config.py 48 | path: notebooks/text_embeddings_models/text-embeddings-on-ipu.ipynb 49 | recursive: true 50 | repository: 51 | <<: *_optimum_graphcore_repository 52 | prefix: notebooks/text_embeddings_models/ 53 | target: 54 | repository: 55 | <<: *_common_target_repository 56 | prefix: natural-language-processing/text-embeddings-models/ 57 | -------------------------------------------------------------------------------- /.github/deployment-configs/deploy-whisper.yaml: 
-------------------------------------------------------------------------------- 1 | 2 | _optimum_graphcore_repository: &_optimum_graphcore_repository 3 | origin: https://github.com/huggingface/optimum-graphcore.git 4 | ref: main 5 | 6 | _examples_internal_repository: &_examples_internal_repository 7 | origin: examples-internal/ 8 | ref: null 9 | 10 | 11 | _common_target_repository: &_common_target_repository 12 | origin: notebooks/ 13 | ref: null 14 | 15 | whisper: 16 | source: 17 | paths: 18 | - expression: '*' 19 | path: notebooks/whisper-example.ipynb 20 | recursive: true 21 | repository: 22 | <<: *_optimum_graphcore_repository 23 | prefix: notebooks/ 24 | target: 25 | renames: {} 26 | repository: 27 | <<: *_common_target_repository 28 | prefix: early-access/whisper/ 29 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## Adding a new notebook checklist 2 | 3 | [Contributing a notebook on confluence](https://graphcore.atlassian.net/wiki/spaces/PM/pages/3242393645/Contributing+a+notebook) contains full instructions, if you have questions please ask on #internal-paperspace-graphcore by tagging @aie-paperspace, here is the checklist: 4 | 5 | - [ ] Your notebook should exist and have been landed in another repository (examples or optimum-graphcore) - (this can be skipped in rare instances) 6 | - [ ] Make it configurable by environment variables [see notebook technical guidelines](https://graphcore.atlassian.net/wiki/spaces/PM/pages/3098345498/Writing+a+Paperspace+notebook#Reading-configuration-in-notebooks) 7 | - [ ] Make sure it has a compliant title [See notebook content guidelines](https://graphcore.atlassian.net/wiki/spaces/PM/pages/3094381247/Notebooks+guidelines) 8 | - [ ] Add an entry to `.github/deployment-configs/` to copy it over - this will create a PR with files automatically copied over for you. You will need to merge it into your branch (simply click the merge button on the automated PR, it will do the right thing). The config format is defined in [graphcore/paperspace-automation - deployment](https://github.com/graphcore/paperspace-automation/tree/main/deployment) 9 | - [ ] remove READMEs (they do not render on Paperspace) 10 | - [ ] make sure appropriate licence is included (MIT: no action needed, other licenses need to be added to folder) 11 | - [ ] Once the file structure matches what you want, merge the PR that was automatically created, ask for feedback from #internal-paperspace-graphcore if you are not sure about the file structure to adopt 12 | - [ ] Generate a short link [confluence instructions](https://graphcore.atlassian.net/wiki/spaces/PM/pages/3219194169/Generating+a+short+URL+for+a+Paperspace+notebook) and add a ROG button on the notebook 13 | - [ ] Make minimal Paperspace specific changes 14 | - [ ] remove relative links in Markdown text (unsupported on Paperspace). Either use full URLs to github, or print the relative path as code e.g. "... 
the notebook at `../tutorial3/walkthrough.ipynb`" 15 | - [ ] unpin matplotlib, pandas and numpy requirements 16 | - [ ] Make sure the graphcore-cloud-tools logger is added 17 | - [ ] Add an entry to test the notebook in `.gradient/notebooks-tests.yaml` 18 | - [ ] Add the notebook to the `README_first.ipynb` 19 | - [ ] Dataset, checkpoint, poplar cache upload ([dataset management - confluence](https://graphcore.atlassian.net/wiki/spaces/PM/pages/3226206448/Paperspace+dataset+management)) 20 | - [ ] Upload any required datasets, checkpoints and caches to `/a/scratch/ai-public-datasets` 21 | - [ ] Symlink any new datasets by editing `.gradient/symlink_config.json`, symlinks are from the read only `PUBLIC_DATASETS_DIR` to the appropriate read/write equivalent `DATASETS_DIR`, `CHECKPOINT_DIR`, `HF_DATASETS_CACHE`, etc... (see `setup.sh` for possibilities) 22 | - [ ] if you need new environment variables defined, make changes to `setup.sh` 23 | - [ ] Download files generated during the CI run which will be cached from AWS ([download - AWS data - confluence](https://graphcore.atlassian.net/wiki/spaces/PM/pages/3226206448/Paperspace+dataset+management#Accessing-artefacts-generated-in-Github-actions)) 24 | - [ ] Upload datasets, checkpoints and other caches to gradient datasets 25 | - [ ] If you have created a new dataset, add corresponding entry to `.gradient/settings.yaml` 26 | - [ ] Test on Paperspace: you can trigger a test on Paperspace by using the "workflow dispatch" trigger in Github Actions and changing "Local" to "Paperspace" (you can also do this manually) 27 | 28 | Once all this is done, or steps have been agreed to be unnecessary, merge this PR 🙂 29 | 30 | Don't forget to tell #internal-paperspace-graphcore that the PR has landed 31 | -------------------------------------------------------------------------------- /.github/test_configs/image-config.yaml: -------------------------------------------------------------------------------- 1 | # This config file allows you to specify whether notebooks need to be run with different configs in CI. 2 | # This is to allow us to test notebooks with different parameters - such as changing docker images 3 | # - `default` is related to the default CI testing infrastructure, with the currently released SDK, fill this if you would like to test on both the default and a specific config 4 | # - `early-access-tests` use an early release docker container and will run tests for these notebooks separately. 5 | 6 | default: 7 | test_names: [] 8 | 9 | 10 | -------------------------------------------------------------------------------- /.github/workflows/2-local-vpod-tests.yml: -------------------------------------------------------------------------------- 1 | name: 2. 
Tests on Local vPOD 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | test_env: 7 | type: choice 8 | description: "Testing environment" 9 | required: false 10 | default: 'Local' 11 | options: 12 | - Local 13 | - Paperspace 14 | docker_image: 15 | type: string 16 | description: "Docker image used in notebook testing" 17 | required: false 18 | default: "graphcore/pytorch-paperspace:3.3.0-ubuntu-20.04-20230703" 19 | notebooks: 20 | type: string 21 | description: "List of notebooks to test in JSON format" 22 | required: false 23 | default: '["Graphcore-HuggingFace-README_first"]' 24 | machine_types: 25 | type: string 26 | description: "List of machines types" 27 | required: false 28 | default: '["IPU-POD4"]' 29 | test_mode: 30 | type: string 31 | description: "The test workload that we are running, default or config set in the .github/test_configs/image-config.yaml" 32 | required: false 33 | test_config: 34 | type: string 35 | description: "Config which can be used to define special parameters such as docker image." 36 | default: ".github/test_configs/image-config.yaml" 37 | required: false 38 | local_cache_type: 39 | type: choice 40 | description: "Use PURE filesystem mount or s3 caches, s3 cache takes 5mn." 41 | required: false 42 | options: 43 | - mount 44 | - s3 45 | 46 | pull_request: 47 | branches-ignore: 48 | - 'gh-action-branches/**' 49 | schedule: 50 | # run at 7:00 PM GMT every night 51 | - cron: '0 19 * * TUE,FRI' 52 | 53 | 54 | jobs: 55 | tests: 56 | uses: graphcore/paperspace-automation/.github/workflows/subwf-vpod-tests-for-nb-repo.yml@main 57 | with: 58 | docker_image: ${{ inputs.docker_image || 'graphcore/pytorch-paperspace:3.3.0-ubuntu-20.04-20230703' }} 59 | notebooks: ${{ inputs.notebooks }} 60 | machine_types: ${{ inputs.machine_types }} 61 | test_env: ${{ inputs.test_env || 'Local' }} 62 | test_mode: "default" 63 | test_config: ${{ inputs.test_config || '.github/test_configs/image-config.yaml' }} 64 | # Use mounts on PRs as they are faster and s3 in nightlies as they are more representative 65 | local_cache_type: ${{ inputs.local_cache_type || (github.event_name == 'pull_request' && 'mount') || 's3' }} 66 | secrets: 67 | gh_user: ${{ secrets.GH_TOKEN_USER }} 68 | gh_token: ${{ secrets.GH_TOKEN_SYNC_REPOS }} 69 | hugging_face_hub_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} 70 | slack_bot_token: ${{ secrets.SLACK_BOT_TOKEN }} 71 | ci_slack_channel_id: ${{ secrets.CI_SLACK_CHANNEL_ID }} 72 | aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} 73 | aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 74 | dataset_s3_download_b64_credential: ${{ secrets.DATASET_S3_DOWNLOAD_B64_CREDENTIAL }} 75 | paperspace_api_key: ${{ secrets.PAPERSPACE_API_KEY }} 76 | gradient_ui_email: ${{ secrets.GRADIENT_UI_EMAIL }} 77 | gradient_ui_password: ${{ secrets.GRADIENT_UI_PASSWORD }} 78 | gradient_validation_key: ${{ secrets.GRADIENT_VALIDATION_KEY }} 79 | 80 | -------------------------------------------------------------------------------- /.github/workflows/3-probe-on-ps.yml: -------------------------------------------------------------------------------- 1 | name: 3. 
Probe on PS env 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | docker_image: 7 | type: string 8 | description: "Docker image used in notebook testing" 9 | required: false 10 | default: "graphcore/pytorch-paperspace:3.3.0-ubuntu-20.04-20230703" 11 | pull_request: 12 | schedule: 13 | # run every 6h and at 9am 14 | - cron: '05 0,6,9,12,18 * * *' 15 | 16 | 17 | jobs: 18 | probe-in-ps: 19 | name: Probe in PS env 20 | uses: graphcore/paperspace-automation/.github/workflows/subwf-probe-in-ps.yml@main 21 | with: 22 | docker_image: ${{ inputs.docker_image || 'graphcore/pytorch-paperspace:3.3.0-ubuntu-20.04-20230703' }} 23 | secrets: 24 | gh_user: ${{ secrets.GH_TOKEN_USER }} 25 | gh_token: ${{ secrets.GH_TOKEN_SYNC_REPOS }} 26 | paperspace_api_key: ${{ secrets.PAPERSPACE_API_KEY }} 27 | gradient_ui_email: ${{ secrets.GRADIENT_UI_EMAIL }} 28 | gradient_ui_password: ${{ secrets.GRADIENT_UI_PASSWORD }} 29 | gradient_validation_key: ${{ secrets.GRADIENT_VALIDATION_KEY }} 30 | slack_bot_token: ${{ secrets.SLACK_BOT_TOKEN }} 31 | ci_slack_channel_id: ${{ secrets.CI_SLACK_CHANNEL_ID }} 32 | -------------------------------------------------------------------------------- /.github/workflows/configs/pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # NOTE: The versions can be updated by calling 2 | # pre-commit autoupdate 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v4.4.0 6 | hooks: 7 | - id: check-json 8 | - id: check-yaml 9 | - id: check-merge-conflict 10 | - id: debug-statements 11 | - id: check-added-large-files 12 | - id: end-of-file-fixer 13 | - id: mixed-line-ending 14 | args: ["--fix=lf"] 15 | - id: trailing-whitespace 16 | - id: detect-private-key 17 | 18 | - repo: https://github.com/psf/black 19 | rev: 22.12.0 20 | hooks: 21 | - id: black 22 | args: [--line-length, "120", --skip-string-normalization] 23 | - id: black-jupyter 24 | files: '.*\.ipynb' 25 | - repo: https://github.com/codespell-project/codespell 26 | rev: v2.2.4 27 | hooks: 28 | - id: codespell 29 | -------------------------------------------------------------------------------- /.github/workflows/configs/ruff.toml: -------------------------------------------------------------------------------- 1 | # Enable flake8-bugbear (`B`) rules. 2 | #select = ["E", "F", "B"] 3 | 4 | # Never enforce `E501` (line length violations). 5 | ignore = ["E501"] 6 | 7 | # Avoid trying to fix flake8-bugbear (`B`) violations. 8 | #unfixable = ["B"] 9 | 10 | # Ignore `E402` (import violations) in all `__init__.py` files, and in `path/to/file.py`. 
11 | #[per-file-ignores] 12 | #"__init__.py" = ["E402"] 13 | #"path/to/file.py" = ["E402"] 14 | -------------------------------------------------------------------------------- /.github/workflows/copy-notebooks-from-source.yml: -------------------------------------------------------------------------------- 1 | name: Copy notebooks from source repos 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | deployment_spec: 7 | required: false 8 | type: string 9 | description: "Spec file to be read for copying notebooks, must be valid input to deployment/deploy.py" 10 | default: .github/deployment-configs/deploy-deberta.yaml 11 | pull_request: 12 | 13 | 14 | jobs: 15 | tests: 16 | uses: graphcore/paperspace-automation/.github/workflows/copy-to-nb-repo.yml@main 17 | with: 18 | deployment_spec: ${{ inputs.deployment_spec }} 19 | secrets: 20 | gh_token: ${{ secrets.GH_TOKEN_SYNC_REPOS }} 21 | -------------------------------------------------------------------------------- /.github/workflows/sync-repos.yml: -------------------------------------------------------------------------------- 1 | name: Sync to public repo 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - main 8 | schedule: 9 | # run at 10:00 PM GMT every night 10 | - cron: '0 22 * * *' 11 | 12 | jobs: 13 | 14 | sync-repos: 15 | name: Sync to public repo 16 | runs-on: 'ubuntu-latest' 17 | steps: 18 | - uses: actions/checkout@v3 19 | with: 20 | fetch-depth: 0 21 | token: ${{ secrets.GH_TOKEN_SYNC_REPOS }} 22 | - name: Sync repos 23 | env: 24 | DEST_REPO_URL: https://${{ secrets.GH_TOKEN_USER }}:${{ secrets.GH_TOKEN_SYNC_REPOS }}@github.com/graphcore/Gradient-HuggingFace 25 | run: | 26 | # checkout all remote branches 27 | git checkout main 28 | for BRANCH in $(git branch -a | grep remotes | grep -v HEAD | grep -v main); do 29 | git branch --force --track "${BRANCH#remotes/origin/}" "${BRANCH}" 30 | done 31 | 32 | # remove pull refs that are for Pull Requests, GitHub does not accept them 33 | git for-each-ref --format 'delete %(refname)' refs/pull | git update-ref --stdin 34 | 35 | git remote add target "${DEST_REPO_URL}" 36 | 37 | git push --mirror target 38 | -------------------------------------------------------------------------------- /.gradient/available_ipus.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | import subprocess 3 | import json 4 | import warnings 5 | import os 6 | 7 | try: 8 | j = subprocess.check_output(['gc-monitor', '-j'], timeout=10) 9 | data = json.loads(j) 10 | num_ipuMs = len(data["cards"]) 11 | num_ipus = 4 * num_ipuMs 12 | except subprocess.TimeoutExpired as err: 13 | num_ipus = 0 14 | print(num_ipus) 15 | nb_id = os.getenv("PAPERSPACE_METRIC_WORKLOAD_ID", "unknown") 16 | raise OSError( 17 | "Connection to IPUs timed-out. This error indicates a problem with the " 18 | "hardware you are running on. Please contact Paperspace Support referencing" 19 | f" the Notebook ID: {nb_id}" 20 | ) from err 21 | # to be captured as a variable in the bash script that calls this python script 22 | print(num_ipus) -------------------------------------------------------------------------------- /.gradient/check_tier.py: -------------------------------------------------------------------------------- 1 | 2 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
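# Prints "FREE" or "PAID" so the calling setup script can tell whether this host is a free-tier Paperspace machine.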
3 | 4 | import os 5 | 6 | hostname = os.getenv("HOSTNAME", "unknown") 7 | 8 | # Free tier hosts 9 | free_hostnames = [f"lr17-1-poplar-{i}" for i in range(1, 36)] 10 | free_hostnames.append("lr17-1-poplar-63") 11 | free_hostnames.append("lr17-1-poplar-64") 12 | 13 | if hostname in free_hostnames: 14 | print("FREE") 15 | else: 16 | print("PAID") 17 | -------------------------------------------------------------------------------- /.gradient/prepare-datasets.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | set -uxo pipefail 3 | run-tests() { 4 | # we do not exit on errors to make sure Paperspace notebooks get terminated 5 | set +e 6 | echo "PAPERSPACE-AUTOMATED-TESTING: Started testing" 7 | if [ "${8}" == "unset" ]; then 8 | EXAMPLES_UTILS_REV=latest_stable 9 | else 10 | EXAMPLES_UTILS_REV=${8} 11 | fi 12 | python -m pip install gradient 13 | python -m pip install "examples-utils[jupyter] @ git+https://github.com/graphcore/examples-utils@${EXAMPLES_UTILS_REV}" 14 | 15 | # set variable matching the standard Paperspace entry point 16 | export PIP_DISABLE_PIP_VERSION_CHECK=1 17 | export HUGGING_FACE_HUB_TOKEN=${7} 18 | export VIRTUAL_ENV="/some/fake/venv/GC-automated-paperspace-test-${4}" 19 | 20 | LOG_FOLDER="${5}/log_${4}_$(date +'%Y-%m-%d-%H_%M_%S')" 21 | mkdir -p ${LOG_FOLDER} 22 | TEST_CONFIG_FILE="${6}" 23 | # Run the health check script 24 | HEALTH_CHECK_LOG_FOLDER="/storage/graphcore_health_checks" 25 | python -m graphcore_cloud_tools.paperspace_utils.health_check --log-folder ${HEALTH_CHECK_LOG_FOLDER} 26 | # Copy the health check logs to local log folder 27 | HEALTH_CHECK_LOG_FILE=$(find ${HEALTH_CHECK_LOG_FOLDER} -type f | sort -n | tail -1) 28 | cp ${HEALTH_CHECK_LOG_FILE} ${LOG_FOLDER} 29 | 30 | cd /notebooks/ 31 | echo "PAPERSPACE-AUTOMATED-TESTING: starting platform_assessment testing" 32 | python -m examples_utils platform_assessment --spec ${TEST_CONFIG_FILE} "${@:9}" \ 33 | --log-dir $LOG_FOLDER \ 34 | --gc-monitor \ 35 | --cloning-directory /tmp/clones \ 36 | --additional-metrics 37 | 38 | exit_code=$? 39 | tar -czvf "${LOG_FOLDER}.tar.gz" ${LOG_FOLDER} 40 | echo "PAPERSPACE-AUTOMATED-TESTING: Testing complete with exit code ${exit_code}" 41 | echo "Shutting down notebook" 42 | 43 | if [ "${PAPERSPACE_METRIC_WORKLOAD_ID:-}" ] 44 | then 45 | sleep 5 46 | gradient apiKey ${1} 47 | gradient notebooks stop --id ${PAPERSPACE_METRIC_WORKLOAD_ID} 48 | fi 49 | echo "Notebook Stopped" 50 | } 51 | 52 | if [ ! 
"$(command -v fuse-overlayfs)" ]; then 53 | echo "fuse-overlayfs not found installing - please update to our latest image" 54 | apt update -y 55 | apt install -o DPkg::Lock::Timeout=120 -y psmisc libfuse3-dev fuse-overlayfs 56 | fi 57 | 58 | python -m pip install "graphcore-cloud-tools[logger] @ git+https://github.com/graphcore/graphcore-cloud-tools@v0.3" 59 | 60 | echo "Starting preparation of datasets" 61 | SCRIPT_DIR="$( dirname -- "${BASH_SOURCE[0]}" )" 62 | # Support passive credential cycling despite pinned dependency 63 | if [ -z "${DATASET_S3_DOWNLOAD_B64_CREDENTIAL:-}" ]; then 64 | curl https://raw.githubusercontent.com/graphcore/graphcore-cloud-tools/main/graphcore_cloud_tools/paperspace_utils/auth.py > tmp_auth.py 65 | DATASET_S3_DOWNLOAD_B64_CREDENTIAL=$(python3 tmp_auth.py) 66 | rm tmp_auth.py 67 | fi 68 | 69 | python -m graphcore_cloud_tools paperspace symlinks --s3-dataset \ 70 | --config-file ${SCRIPT_DIR}/symlink_config.json \ 71 | --gradient-settings-file ${SCRIPT_DIR}/settings.yaml --num-concurrent-downloads 20 --max-concurrency 1 72 | echo "Finished running prepare-datasets.sh" 73 | # Run automated test if specified 74 | if [[ "${1:-}" == 'test' ]]; then 75 | ARGS="${@:2}" 76 | elif [[ "${2:-}" == 'test' ]]; then 77 | ARGS="${@:3}" 78 | fi 79 | [ -n "${ARGS+x}" ] && run-tests $ARGS 80 | 81 | echo "Finished running setup.sh." 82 | -------------------------------------------------------------------------------- /.gradient/settings.yaml: -------------------------------------------------------------------------------- 1 | integrations: 2 | gcl: 3 | type: dataset 4 | ref: paperspace/ds7me5hgjbfht6q:8ngwr2a 5 | magma: 6 | type: dataset 7 | ref: graphcore-managed-s3 8 | poplar-executables-hf-3-3: 9 | type: dataset 10 | ref: paperspace/ds367opyfl97110:be9cyhp 11 | librispeech_asr: 12 | type: dataset 13 | ref: paperspace/ds1uofih1koi71b:xi0qac2 14 | # removing superb while the audio classification nb is removed 15 | # superb: 16 | # type: dataset 17 | # ref: paperspace/dsgrgvk6f7zecuw:fov2xl1 18 | # removing downloads while the audio classification nb is removed 19 | # graphcore-downloads: 20 | # type: dataset 21 | # ref: paperspace/ds52xkj0j1elf02:5nesnjo 22 | dfki-sentinel-eurosat: 23 | type: dataset 24 | ref: paperspace/ds8p6sv96fl1att:k5j4cob 25 | -------------------------------------------------------------------------------- /.gradient/symlink_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "${POPLAR_EXECUTABLE_CACHE_DIR}":["${S3_DATASETS_DIR}/poplar-executables-hf-3-3/${SDK_VERSION}"], 3 | "${HF_DATASETS_CACHE}/librispeech_asr":["${S3_DATASETS_DIR}/librispeech_asr"], 4 | "${DATASETS_DIR}/dfki-sentinel-eurosat":["${S3_DATASETS_DIR}/dfki-sentinel-eurosat"], 5 | "${DATASETS_DIR}/magma":["${S3_DATASETS_DIR}/magma"] 6 | } 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
4 | 5 | Copyright (c) 2022 Gradient° 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Training 🤗 Models on IPUs using Paperspace Gradient 2 | 3 | Whether you are looking to generate images with Stable Diffusion, derive insights from text, or need to recognize audio samples, the examples in here have you covered. 4 | 5 | 6 | #### Join our Slack community to interact with other developers! 7 | 8 | [![Join our Slack Community](https://img.shields.io/badge/Slack-Join%20Graphcore's%20Community-blue?style=flat-square&logo=slack)](https://www.graphcore.ai/join-community) 9 | 10 | 11 | ## License 12 | 13 | Unless otherwise specified by a LICENSE file in a subdirectory, the LICENSE referenced at the top level applies to the files in this repository. 14 | 15 | “Jupyter” and the Jupyter logos are trademarks or registered trademarks of NumFOCUS, used by Graphcore with permission. -------------------------------------------------------------------------------- /audio-processing/requirements.txt: -------------------------------------------------------------------------------- 1 | optimum-graphcore==0.7 2 | --find-links https://download.pytorch.org/whl/torch_stable.html 3 | torchaudio == 2.0.2+cpu 4 | librosa 5 | numpy>=1.22 6 | jiwer 7 | soundfile 8 | graphcore-cloud-tools[logger] @ git+https://github.com/graphcore/graphcore-cloud-tools@v0.3 9 | -------------------------------------------------------------------------------- /dolly2-instruction-following/api/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | from .pipeline import DollyPipeline 4 | -------------------------------------------------------------------------------- /dolly2-instruction-following/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
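# Re-exports the Dolly configuration classes and exposes CONFIG_DIR, the directory holding the bundled YAML configs.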
2 | 3 | from .config import DollyConfig, Execution 4 | import os 5 | from pathlib import Path 6 | 7 | CONFIG_DIR = Path(os.path.dirname(__file__)) 8 | 9 | del os, Path 10 | -------------------------------------------------------------------------------- /dolly2-instruction-following/config/inference.yml: -------------------------------------------------------------------------------- 1 | # -------- Models -------- 2 | tiny: &tiny 3 | model: 4 | layers: 2 5 | hidden_size: 100 6 | sequence_length: 64 7 | attention: 8 | heads: 4 9 | rotary_dim: 4 10 | embedding: 11 | vocab_size: 150 12 | 13 | dolly: &dolly 14 | model: 15 | layers: 36 16 | hidden_size: 5120 17 | sequence_length: 2048 18 | attention: 19 | heads: 40 20 | rotary_positional_embeddings_base: 10000 21 | rotary_dim: 32 # should be rotary_pct of head dim 22 | embedding: 23 | vocab_size: 50280 24 | # ------------------------- 25 | 26 | # ------- Execution ------- 27 | release: 28 | tiny: 29 | <<: *tiny 30 | execution: 31 | micro_batch_size: 4 32 | available_memory_proportion: [ 0.4 ] 33 | tensor_parallel: 4 34 | 35 | dolly_pod4: 36 | <<: *dolly 37 | execution: 38 | micro_batch_size: 1 39 | available_memory_proportion: [ 0.4 ] 40 | tensor_parallel: 4 41 | 42 | dolly_pod16: 43 | <<: *dolly 44 | execution: 45 | micro_batch_size: 4 46 | available_memory_proportion: [ 0.4 ] 47 | tensor_parallel: 16 48 | attention_tensor_parallel: 8 49 | -------------------------------------------------------------------------------- /dolly2-instruction-following/modelling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | -------------------------------------------------------------------------------- /dolly2-instruction-following/modelling/decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
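# Tensor-parallel Dolly decoder: a single decoder block runs attention and feed-forward as parallel branches (GPT-NeoX style) around two layer norms, and the decoder outlines that block once and binds it per layer.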
2 | import numpy as np 3 | from typing import Dict 4 | 5 | import popxl 6 | from popxl import ops 7 | from popxl.utils import to_numpy 8 | 9 | import popxl_addons as addons 10 | from popxl_addons import NamedTensors 11 | from popxl_addons.layers import LayerNorm 12 | 13 | from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXLayer as HFModel 14 | 15 | from config import DollyConfig 16 | from .attention import DollySelfAttentionTP 17 | from .feed_forward import DollyFeedForwardTP 18 | 19 | 20 | class DollyDecoderBlockTP(addons.Module): 21 | def __init__(self, config: DollyConfig): 22 | super().__init__() 23 | self.config = config 24 | # begins with identical computations: layer norm ln_1 25 | self.ln_1 = LayerNorm() 26 | self.ln_2 = LayerNorm() 27 | # attention is sharded 28 | # identical computation for bias and skip connection 29 | self.attention = DollySelfAttentionTP(self.config) 30 | # begins with identical computations: layer norm ln_2 31 | # feed forward is sharded 32 | # identical computation for bias, dropout and skip connection 33 | self.feed_forward = DollyFeedForwardTP(self.config) 34 | 35 | def build(self, x: popxl.Tensor): 36 | residual = x 37 | attn_out = self.attention(self.ln_1(x)) 38 | 39 | ff_out = self.feed_forward(self.ln_2(x)) 40 | x = attn_out + ff_out + residual 41 | return x 42 | 43 | @staticmethod 44 | def hf_mapping(config: DollyConfig, variables: NamedTensors, hf_model: HFModel) -> Dict[popxl.Tensor, np.ndarray]: 45 | dtype = config.model.dtype 46 | weights = { 47 | variables.ln_1.weight: to_numpy(hf_model.input_layernorm.weight.data, dtype), 48 | variables.ln_1.bias: to_numpy(hf_model.input_layernorm.bias.data, dtype), 49 | variables.ln_2.weight: to_numpy(hf_model.post_attention_layernorm.weight.data, dtype), 50 | variables.ln_2.bias: to_numpy(hf_model.post_attention_layernorm.bias.data, dtype), 51 | } 52 | weights.update(DollySelfAttentionTP.hf_mapping(config, variables.attention, hf_model.attention)) 53 | weights.update(DollyFeedForwardTP.hf_mapping(config, variables.feed_forward, hf_model.mlp)) 54 | 55 | return weights 56 | 57 | 58 | class DollyDecoderTP(addons.Module): 59 | def __init__(self, config: DollyConfig): 60 | super().__init__() 61 | self.config = config 62 | 63 | def build(self, x: popxl.Tensor): 64 | 65 | facts, graph = DollyDecoderBlockTP(self.config).create_graph(x) # Outline GPT Layer 66 | 67 | for i in range(self.config.model.layers): 68 | args_nt = self.add_variable_inputs(i, facts) 69 | (x,) = graph.bind(args_nt).call(x) 70 | 71 | return x 72 | -------------------------------------------------------------------------------- /dolly2-instruction-following/modelling/dolly_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
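# Full tensor-parallel Dolly transformer body: embeddings, decoder stack and optional final layer norm, with the mapping from Hugging Face GPT-NeoX weights.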
2 | import numpy as np 3 | from typing import Dict 4 | from config import DollyConfig 5 | 6 | import popxl 7 | from popxl.utils import to_numpy 8 | 9 | import popxl_addons as addons 10 | from popxl_addons import NamedTensors 11 | 12 | from popxl_addons.layers import LayerNorm 13 | 14 | from .embedding import DollyEmbeddingsTP 15 | from .decoder import DollyDecoderTP, DollyDecoderBlockTP 16 | 17 | from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXModel as HFModel 18 | 19 | 20 | class DollyModelTP(addons.Module): 21 | def __init__(self, config: DollyConfig, include_layer_norm=True): 22 | super().__init__() 23 | self.config = config 24 | # sharded, then last bit identical 25 | self.embeddings = DollyEmbeddingsTP(self.config) 26 | # identical inputs, then sharded, then identical 27 | self.decoder = DollyDecoderTP(self.config) 28 | # identical 29 | self.include_layer_norm = include_layer_norm 30 | if self.include_layer_norm: 31 | self.ln_f = LayerNorm() 32 | 33 | def build(self, input_ids: popxl.Tensor): 34 | x = self.embeddings(input_ids) 35 | x = self.decoder(x) 36 | if self.include_layer_norm: 37 | x = self.ln_f(x) 38 | return x 39 | 40 | @staticmethod 41 | def hf_mapping( 42 | config: DollyConfig, variables: NamedTensors, hf_model: HFModel, layer_norm=True 43 | ) -> Dict[popxl.Tensor, np.ndarray]: 44 | dtype = config.model.dtype 45 | weights = {} 46 | if layer_norm: 47 | weights = { 48 | variables.ln_f.weight: to_numpy(hf_model.final_layer_norm.weight.data, dtype), 49 | variables.ln_f.bias: to_numpy(hf_model.final_layer_norm.bias.data, dtype), 50 | } 51 | 52 | weights.update(DollyEmbeddingsTP.hf_mapping(config, variables.embeddings, hf_model)) 53 | 54 | for l in range(config.model.layers): 55 | weights.update(DollyDecoderBlockTP.hf_mapping(config, variables.decoder[l], hf_model.layers[l])) 56 | 57 | return weights 58 | -------------------------------------------------------------------------------- /dolly2-instruction-following/modelling/feed_forward.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
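# Tensor-parallel MLP: the intermediate projection is sharded column-wise (gelu is non-linear), the output projection row-wise, followed by an all-reduce and a bias applied identically on every shard.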
2 | from typing import Optional, List, Dict 3 | import popxl 4 | from popxl import ops 5 | from popxl.utils import to_numpy 6 | from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXMLP as HFModel 7 | 8 | import popxl_addons as addons 9 | from config import DollyConfig 10 | from popxl_addons.layers import Linear 11 | import numpy as np 12 | 13 | from popxl_addons.named_tensors import NamedTensors 14 | from popxl_addons.ops.replicated_all_reduce_TP import replicated_all_reduce 15 | from popxl_addons.array_munging import shard 16 | 17 | 18 | class DollyFeedForwardTP(addons.Module): 19 | def __init__(self, config: DollyConfig, ff_size: Optional[int] = None): 20 | super().__init__() 21 | self.config = config 22 | tp = config.execution.tensor_parallel 23 | dp = config.execution.data_parallel 24 | self.n_shards = tp 25 | self.replica_grouping = popxl.gcg().ir.replica_grouping(stride=tp, group_size=dp) 26 | # Also known as the intermediate size 27 | self.ff_size = 4 * config.model.hidden_size if ff_size is None else ff_size 28 | assert self.ff_size % self.n_shards == 0 29 | # ----- Layers ----- 30 | # Sharded across devices - column wise 31 | self.intermediate = Linear(self.ff_size // self.n_shards, replica_grouping=self.replica_grouping) 32 | 33 | # Sharded across devices - row wise (bias applied separately) 34 | self.output = Linear(config.model.hidden_size, bias=False, replica_grouping=self.replica_grouping) 35 | 36 | def build(self, x: popxl.Tensor) -> List[popxl.Tensor]: 37 | """Identical input (x, seed) and identical output across shards.""" 38 | # ----- Sharded computation ----- 39 | 40 | # Shard column-wise since gelu is not linear. 41 | # Indeed, sharding row wise requires a sum AllReduce at the end, 42 | # but gelu is not linear: gelu(x+y) != gelu(x) + gelu(y) 43 | z = self.intermediate(x) 44 | z = ops.gelu(z) 45 | # Here, x is already sharded across devices. Since we don't have non linearities, 46 | # we can shard row-wise (which requires both X and the weight matrix to be sharded) 47 | # and then perform an all reduce 48 | z = self.output(z) 49 | 50 | z = replicated_all_reduce(z, group=self.replica_grouping.transpose()) 51 | 52 | # ----- Identical computation ----- 53 | 54 | # Output linear layer bias (identical bias on all devices) 55 | self.bias = self.add_variable_input("bias", lambda: np.zeros(z.shape[-1]), z.dtype) 56 | z = z + self.bias 57 | 58 | return z 59 | 60 | @staticmethod 61 | def hf_mapping(config: DollyConfig, variables: NamedTensors, hf_model: HFModel) -> Dict[popxl.Tensor, np.ndarray]: 62 | dtype = config.model.dtype 63 | n_shards = config.execution.tensor_parallel 64 | 65 | return { 66 | # HF DollyMLP 67 | variables.intermediate.weight: shard( 68 | to_numpy(hf_model.dense_h_to_4h.weight.data.T, dtype), n_shards, axis=-1 69 | ), 70 | variables.intermediate.bias: shard(to_numpy(hf_model.dense_h_to_4h.bias.data, dtype), n_shards, axis=-1), 71 | variables.output.weight: shard(to_numpy(hf_model.dense_4h_to_h.weight.data.T, dtype), n_shards, axis=0), 72 | variables.bias: to_numpy(hf_model.dense_4h_to_h.bias.data, dtype), 73 | } 74 | -------------------------------------------------------------------------------- /dolly2-instruction-following/modelling/hf_mapping.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
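# Helpers for copying weights between Hugging Face GPT-NeoX models and the popxl TaskSession state, in both directions.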
2 | 3 | from typing import Dict 4 | import numpy as np 5 | 6 | from transformers.models.gpt_neox import GPTNeoXModel as HFModel 7 | from transformers.models.gpt_neox import GPTNeoXForCausalLM as HFLMHeadModel 8 | 9 | import popxl 10 | from popxl_addons import TaskSession 11 | 12 | from config import DollyConfig 13 | from modelling.dolly_model import DollyModelTP 14 | from modelling.dolly_lm import DollyLMHeadModelTP 15 | 16 | 17 | def hf_mapping_lm_tp( 18 | config: DollyConfig, session: TaskSession, pretrained: HFLMHeadModel 19 | ) -> Dict[popxl.Tensor, np.ndarray]: 20 | load_to = session.state 21 | if "fwd" in session.state: 22 | load_to = session.state.fwd 23 | weights = DollyLMHeadModelTP.hf_mapping(config, load_to, pretrained) 24 | return weights 25 | 26 | 27 | def hf_mapping_TP(config: DollyConfig, session: TaskSession, pretrained: HFModel) -> Dict[popxl.Tensor, np.ndarray]: 28 | load_to = session.state 29 | if "fwd" in session.state: 30 | load_to = session.state.fwd 31 | weights = DollyModelTP.hf_mapping(config, load_to, pretrained) 32 | return weights 33 | 34 | 35 | def load_lm_to_hf(session: TaskSession, hf_model: HFLMHeadModel) -> HFLMHeadModel: 36 | weights = session.get_named_tensors_data() 37 | if "fwd" in weights: 38 | weights = weights.fwd 39 | state_dict = DollyLMHeadModelTP.to_hf(weights, hf_model) 40 | # check only missing keys are mask-related keys 41 | hf_state_keys = hf_model.state_dict().keys() 42 | popxl_keys = state_dict.keys() 43 | 44 | def should_check(k: str): 45 | return "attn.bias" not in k and "attn.masked_bias" not in k 46 | 47 | for k in hf_state_keys: 48 | if should_check(k) and k not in popxl_keys: 49 | raise KeyError(f"key {k} not found in session state") 50 | 51 | hf_model.load_state_dict(state_dict, strict=False) 52 | return hf_model 53 | 54 | 55 | def load_to_hf(session: TaskSession, hf_model: HFModel) -> HFModel: 56 | weights = session.get_named_tensors_data() 57 | if "fwd" in weights: 58 | weights = weights.fwd 59 | 60 | state_dict = DollyModelTP.to_hf(weights, hf_model) 61 | # check only missing keys are mask-related keys 62 | hf_state_keys = hf_model.state_dict().keys() 63 | popxl_keys = state_dict.keys() 64 | 65 | def should_check(k: str): 66 | return "attn.bias" not in k and "attn.masked_bias" not in k 67 | 68 | for k in hf_state_keys: 69 | if should_check(k) and k not in popxl_keys: 70 | raise KeyError(f"key {k} not found in session state") 71 | 72 | hf_model.load_state_dict(state_dict, strict=False) 73 | return hf_model 74 | -------------------------------------------------------------------------------- /dolly2-instruction-following/modelling/rotary_pos_embed/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | from .rotary_pos_embed import * 3 | -------------------------------------------------------------------------------- /dolly2-instruction-following/modelling/rotary_pos_embed/common.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
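// Shared PopART operator identifiers for the custom rotary positional embedding op and its gradient.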
2 | #ifndef GUARD_ROTARYPOSEMBED_OPIDS 3 | #define GUARD_ROTARYPOSEMBED_OPIDS 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using InMapType = std::map; 11 | using OutMapType = std::map; 12 | using OutIndex = int; 13 | 14 | namespace popart { 15 | 16 | #define CUSTOM_OP_DOMAIN "popxl.addons.ops" 17 | 18 | const popart::OperatorIdentifier RotaryPosEmbed = OperatorIdentifier{ 19 | CUSTOM_OP_DOMAIN, 20 | "RotaryPosEmbed", 21 | 1, // Op version 22 | {3, 3}, // number of inputs 23 | 1 // number of outputs 24 | }; 25 | 26 | const popart::OperatorIdentifier RotaryPosEmbedGrad = OperatorIdentifier{ 27 | CUSTOM_OP_DOMAIN, 28 | "RotaryPosEmbedGrad", 29 | 1, // Op version 30 | {3, 3}, // number of inputs 31 | 1 // number of outputs 32 | }; 33 | 34 | } // namespace popart 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /dolly2-instruction-following/modelling/rotary_pos_embed/rotary_pos_embed.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include "common.hpp" 19 | #include "rotary_pos_embed.hpp" 20 | 21 | namespace popart { 22 | 23 | ///////////////////////////////////////////////////////////// 24 | ////// Fwd op 25 | 26 | RotaryPosEmbedOp::RotaryPosEmbedOp(const OperatorIdentifier &_opid, 27 | uint32_t rotary_dim_, 28 | const Op::Settings &settings_) 29 | : Op(_opid, settings_), rotary_dim{rotary_dim_} { 30 | if ((rotary_dim % 2) != 0) { 31 | throw error("RotaryPosEmbedOp::RotaryPosEmbedOp rotary_dim must be a " 32 | "multiple of 2"); 33 | } 34 | } 35 | 36 | std::unique_ptr RotaryPosEmbedOp::clone() const { 37 | return std::make_unique(*this); 38 | } 39 | 40 | std::vector> RotaryPosEmbedOp::getGradOps() { 41 | std::vector> result; 42 | result.push_back(std::make_unique(*this)); 43 | return result; 44 | } 45 | 46 | void RotaryPosEmbedOp::setup() { 47 | auto xInfo = inInfo(0); 48 | auto cosInfo = inInfo(1); 49 | auto sinInfo = inInfo(2); 50 | 51 | // check expected shapes 52 | if (xInfo.rank() != 4) { 53 | throw error( 54 | "RotaryPosEmbedOp::setup x should have rank 4 (batch, heads, seq, hh)"); 55 | } 56 | if (cosInfo.rank() != 3 || sinInfo.rank() != 3) { 57 | throw error("RotaryPosEmbedOp::setup trig functions should have rank 3 " 58 | "(1 or batch, seq, hh/2)"); 59 | } 60 | if ((rotary_dim % 2) != 0) { 61 | throw error("RotaryPosEmbedOp::setup rotary dim must be a multiple of 2"); 62 | } 63 | 64 | // x rotated 65 | outInfo(0) = xInfo; 66 | } 67 | 68 | void RotaryPosEmbedOp::appendOutlineAttributes(OpSerialiserBase &os) const { 69 | os.appendAttribute("rotary_dim", rotary_dim); 70 | Op::appendOutlineAttributes(os); 71 | } 72 | 73 | ///////////////////////////////////////////////////////////// 74 | ////// Grad op 75 | 76 | RotaryPosEmbedGradOp::RotaryPosEmbedGradOp(const RotaryPosEmbedOp &op) 77 | : Op(RotaryPosEmbedGrad, op.getSettings()), rotary_dim{op.rotary_dim} {} 78 | 79 | const std::map &RotaryPosEmbedGradOp::gradOutToNonGradIn() const { 80 | static const std::map outInfo = {{0, 0}}; 81 | return outInfo; 82 | } 83 | 84 | const std::vector & 85 | RotaryPosEmbedGradOp::gradInputInfo() const { 86 | static const std::vector inInfo = { 87 | {0, 0, GradOpInType::GradOut}, 88 | {1, 1, GradOpInType::In}, 89 | {2, 2, 
GradOpInType::In}}; 90 | return inInfo; 91 | } 92 | 93 | void RotaryPosEmbedGradOp::setup() { outInfo(0) = inInfo(0); } 94 | 95 | std::unique_ptr RotaryPosEmbedGradOp::clone() const { 96 | return std::make_unique(*this); 97 | } 98 | 99 | void RotaryPosEmbedGradOp::appendOutlineAttributes(OpSerialiserBase &os) const { 100 | os.appendAttribute("rotary_dim", rotary_dim); 101 | Op::appendOutlineAttributes(os); 102 | } 103 | 104 | } // namespace popart 105 | -------------------------------------------------------------------------------- /dolly2-instruction-following/modelling/rotary_pos_embed/rotary_pos_embed.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | #ifndef GUARD_NEURALNET_STRIDEDSLICE_HPP 3 | #define GUARD_NEURALNET_STRIDEDSLICE_HPP 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "common.hpp" 10 | 11 | namespace popart { 12 | 13 | class RotaryPosEmbedOp : public Op { 14 | public: 15 | RotaryPosEmbedOp(const OperatorIdentifier &_opid, uint32_t rotary_dim_, 16 | const Op::Settings &settings_); 17 | 18 | std::unique_ptr clone() const override; 19 | std::vector> getGradOps() override; 20 | void setup() final; 21 | 22 | float getSubgraphValue() const override { return getHighSubgraphValue(); } 23 | 24 | static RotaryPosEmbedOp * 25 | createOpInGraph(popart::Graph &graph, const InMapType &in, 26 | const OutMapType &out, uint32_t rotary_dim_, 27 | const popart::Op::Settings &settings) { 28 | return graph.createConnectedOp(in, out, RotaryPosEmbed, 29 | rotary_dim_, settings); 30 | } 31 | 32 | void appendOutlineAttributes(OpSerialiserBase &) const override; 33 | 34 | uint32_t rotary_dim = 0; 35 | }; 36 | 37 | class RotaryPosEmbedGradOp : public Op { 38 | public: 39 | RotaryPosEmbedGradOp(const RotaryPosEmbedOp &op); 40 | 41 | void setup() final; 42 | std::unique_ptr clone() const override; 43 | const std::vector &gradInputInfo() const final; 44 | const std::map &gradOutToNonGradIn() const final; 45 | 46 | float getSubgraphValue() const override { return getHighSubgraphValue(); } 47 | 48 | void appendOutlineAttributes(OpSerialiserBase &) const override; 49 | 50 | uint32_t rotary_dim = 0; 51 | }; 52 | 53 | } // namespace popart 54 | 55 | #endif 56 | -------------------------------------------------------------------------------- /dolly2-instruction-following/modelling/rotary_pos_embed/rotary_pos_embed_binding.cpp: -------------------------------------------------------------------------------- 1 | // cppimport 2 | // NOTE: the cppimport comment is necessary for dynamic compilation when loading 3 | // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #include "common.hpp" 32 | #include "rotary_pos_embed.hpp" 33 | #include "rotary_pos_embedx.hpp" 34 | 35 | namespace py = pybind11; 36 | 37 | // -------------- PyBind -------------- 38 | // `rotary_pos_embed_binding` must equal filename 39 | PYBIND11_MODULE(rotary_pos_embed_binding, m) { 40 | // Bindings the parameters of the op: constructor + fields. 
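// Assumed usage sketch (Python side, not part of this file): rotary_pos_embed.py is
// expected to compile and load this module dynamically via cppimport, e.g.
//   import cppimport
//   binding = cppimport.imp("rotary_pos_embed_binding")
//   op = binding.RotaryPosEmbedOp.createOpInGraph(graph, inputs, outputs, rotary_dim, settings)
// cppimport picks up the configuration block at the bottom of this file.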
41 | py::class_> 43 | binding(m, "RotaryPosEmbedOp"); 44 | binding.def_static( 45 | "createOpInGraph", 46 | py::overload_cast( 48 | &popart::RotaryPosEmbedOp::createOpInGraph), 49 | py::arg("graph"), py::arg("inputs"), py::arg("outputs"), 50 | py::arg("rotaryDim"), py::arg("settings"), 51 | py::return_value_policy::reference); 52 | binding.def("outTensor", 53 | py::overload_cast(&popart::RotaryPosEmbedOp::outTensor), 54 | py::return_value_policy::reference); 55 | }; 56 | 57 | // -------------- cppimport -------------- 58 | // cppimport configuration for compiling the pybind11 module. 59 | // clang-format off 60 | /* 61 | <% 62 | cfg['sources'] = ['rotary_pos_embed.cpp', 'rotary_pos_embedx.cpp'] 63 | cfg['extra_compile_args'] = ['-std=c++14', '-fPIC', '-O2', '-DONNX_NAMESPACE=onnx', '-Wall', '-Wno-sign-compare'] 64 | cfg['libraries'] = ['popart', 'poputil', 'popops', 'poplin', 'popnn', 'poprand', 'gcl'] 65 | setup_pybind11(cfg) 66 | %> 67 | */ 68 | -------------------------------------------------------------------------------- /dolly2-instruction-following/modelling/rotary_pos_embed/rotary_pos_embedx.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | #ifndef GUARD_NEURALNET_ROTARYPOSEMBEDX_HPP 3 | #define GUARD_NEURALNET_ROTARYPOSEMBEDX_HPP 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | namespace popart { 10 | namespace popx { 11 | 12 | class RotaryPosEmbedOpx : public Opx { 13 | public: 14 | RotaryPosEmbedOpx(Op *, Devicex *); 15 | 16 | void grow(poplar::program::Sequence &) const; 17 | }; 18 | 19 | class RotaryPosEmbedGradOpx : public Opx { 20 | public: 21 | RotaryPosEmbedGradOpx(Op *, Devicex *); 22 | 23 | void grow(poplar::program::Sequence &) const; 24 | }; 25 | 26 | } // namespace popx 27 | } // namespace popart 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /dolly2-instruction-following/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy==1.10.1 3 | 4 | 5 | git+https://github.com/graphcore/popxl-addons.git@sdk-release-3.3.0 6 | transformers 7 | tokenizers==0.13.3 8 | accelerate==0.20.3 9 | 10 | pytest==6.2.5 11 | pytest-pythonpath==0.7.4 12 | 13 | graphcore-cloud-tools[logger] @ git+https://github.com/graphcore/graphcore-cloud-tools@v0.3 14 | examples-utils[common] @ git+https://github.com/graphcore/examples-utils@v3.2 15 | 16 | -f https://download.pytorch.org/whl/torch_stable.html 17 | torch==2.0.1+cpu 18 | -------------------------------------------------------------------------------- /dolly2-instruction-following/tests/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | import os 3 | from config import DollyConfig 4 | 5 | import pytest 6 | 7 | from config import DollyConfig 8 | from utils.simple_parsing_tools import parse_args_with_config_file 9 | 10 | 11 | def _test_config_file(): 12 | return os.path.join(os.path.dirname(__file__), "test_config.yml") 13 | 14 | 15 | @pytest.fixture 16 | def test_config_file(): 17 | return _test_config_file() 18 | 19 | 20 | @pytest.fixture 21 | def test_config(): 22 | return parse_args_with_config_file(DollyConfig, ["--config", _test_config_file()]) 23 | 24 | 25 | # Below functions enable long tests to be skipped, unless a --long-test 26 | # cli option is specified. 
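# Hedged sketch of a typical consumer of this flag (not defined in this file): a
# pytest_collection_modifyitems hook could skip marked tests when the option is
# absent, e.g.
#   def pytest_collection_modifyitems(config, items):
#       if not config.getoption("--long-tests"):
#           skip = pytest.mark.skip(reason="pass --long-tests to run")
#           for item in items:
#               if "long_test" in item.keywords:
#                   item.add_marker(skip)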
27 | def pytest_addoption(parser): 28 | parser.addoption("--long-tests", action="store_true", default=False, help="Run long tests") 29 | -------------------------------------------------------------------------------- /dolly2-instruction-following/tests/integration/execution/test_execution.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | import os 3 | import sys 4 | from pathlib import Path 5 | 6 | from examples_tests.test_util import SubProcessChecker 7 | 8 | root_dir = Path(__file__).parent.parent.parent.parent.resolve() 9 | 10 | 11 | def dolly_root_env_path(): 12 | env = os.environ 13 | env["PYTHONPATH"] = ":".join((*sys.path, str(root_dir))) 14 | return env 15 | 16 | 17 | class TestExecution(SubProcessChecker): 18 | def test_inference(self): 19 | self.run_command( 20 | "python3 inference.py --config tiny --layers 2 " 21 | "--tensor_parallel 4 " 22 | "--vocab_size 128 --sequence_length 16 " 23 | "--hidden_size 128 --heads 8", 24 | root_dir, 25 | ["Duration"], 26 | env=dolly_root_env_path(), 27 | ) 28 | -------------------------------------------------------------------------------- /dolly2-instruction-following/tests/integration/layers/test_attention_TP.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | import numpy as np 3 | import torch 4 | 5 | # HF 6 | from transformers.models.gpt_neox import GPTNeoXConfig as HFConfig 7 | from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXAttention 8 | 9 | import popxl 10 | 11 | import popxl_addons as addons 12 | from popxl_addons.patterns import apply_pre_alias_patterns 13 | from config import DollyConfig 14 | from modelling.attention import DollySelfAttentionTP 15 | from popxl_addons.array_munging import repeat 16 | 17 | 18 | def test_attention_TP_cmp_huggingface(test_config: DollyConfig): 19 | torch.manual_seed(42) 20 | 21 | batch_size = test_config.execution.micro_batch_size 22 | seq_len = test_config.model.sequence_length 23 | hidden_size = test_config.model.hidden_size 24 | intermediate_size = hidden_size * 4 25 | 26 | # HuggingFace 27 | config = HFConfig( 28 | hidden_size=hidden_size, 29 | max_position_embeddings=seq_len, 30 | intermediate_size=intermediate_size, 31 | num_attention_heads=test_config.model.attention.heads, 32 | rotary_dim=test_config.model.attention.rotary_dim, 33 | ) 34 | hf_model = GPTNeoXAttention(config).eval() 35 | 36 | # HF forward 37 | input_t = torch.rand((batch_size, seq_len, hidden_size), requires_grad=True) 38 | output_, *_ = hf_model(input_t, None) 39 | output_HF = output_.detach().numpy() 40 | 41 | # TP 42 | n_shards = test_config.execution.tensor_parallel 43 | 44 | # popxl 45 | ir = popxl.Ir() 46 | ir.replication_factor = n_shards 47 | with ir.main_graph: 48 | inputs_data, inputs_host_steam, inputs_tensors = zip( 49 | *[ 50 | addons.host_load( 51 | input_t.reshape(-1, test_config.model.hidden_size), test_config.model.dtype, name="input" 52 | ), 53 | ] 54 | ) 55 | (x,) = inputs_tensors 56 | 57 | attn_args, attn_graph = DollySelfAttentionTP(test_config).create_graph(x) 58 | 59 | vars = attn_args.init() 60 | fwd_info = attn_graph.bind(vars).call_with_info(x) 61 | (acts,) = fwd_info.outputs 62 | 63 | fwd_d2h = addons.host_store(acts) 64 | 65 | # Run `OpToIdentityPattern` among others part of `PreAliasPatterns` 66 | apply_pre_alias_patterns(ir, level="default") 67 | 68 | weights = 
DollySelfAttentionTP.hf_mapping(test_config, vars, hf_model) 69 | 70 | inputs = {h2d: repeat(data, n_shards).squeeze() for h2d, data in zip(inputs_host_steam, inputs_data)} 71 | 72 | with popxl.Session(ir, "ipu_hw") as session: 73 | session.write_variables_data(weights) 74 | outputs_popxl = session.run(inputs) 75 | 76 | fwd_data = outputs_popxl[fwd_d2h] 77 | 78 | if n_shards > 1: 79 | assert len(fwd_data) == n_shards 80 | 81 | # Assert all IPU outputs are identical 82 | for i in range(1, n_shards): 83 | np.testing.assert_equal(fwd_data[0], fwd_data[i]) 84 | else: 85 | fwd_data = np.expand_dims(fwd_data, axis=0) 86 | 87 | # Assert nearly equal to HF 88 | np.testing.assert_almost_equal(output_HF, fwd_data[0].reshape(output_HF.shape), 4) 89 | -------------------------------------------------------------------------------- /dolly2-instruction-following/tests/integration/layers/test_decoder_block_TP.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | import numpy as np 3 | import torch 4 | 5 | # HF 6 | from transformers.models.gpt_neox.configuration_gpt_neox import GPTNeoXConfig as HFConfig 7 | from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXLayer 8 | 9 | import popxl 10 | from popxl.utils import to_numpy 11 | 12 | import popxl_addons as addons 13 | from popxl_addons.patterns import apply_pre_alias_patterns 14 | 15 | from config import DollyConfig 16 | from modelling.decoder import DollyDecoderBlockTP 17 | from popxl_addons.array_munging import repeat 18 | 19 | 20 | def test_decoder_block_TP_cmp_huggingface(test_config: DollyConfig): 21 | torch.manual_seed(42) 22 | 23 | batch_size = test_config.execution.micro_batch_size 24 | seq_len = test_config.model.sequence_length 25 | hidden_size = test_config.model.hidden_size 26 | intermediate_size = hidden_size * 4 27 | 28 | # HuggingFace 29 | config = HFConfig( 30 | hidden_size=hidden_size, 31 | max_position_embeddings=seq_len, 32 | intermediate_size=intermediate_size, 33 | num_attention_heads=test_config.model.attention.heads, 34 | rotary_dim=test_config.model.attention.rotary_dim, 35 | use_parallel_residual=True, 36 | ) 37 | hf_model = GPTNeoXLayer(config).eval() 38 | 39 | # HF forward 40 | input_t = torch.rand((batch_size, seq_len, hidden_size), requires_grad=True) 41 | (output_,) = hf_model(input_t) 42 | 43 | output_HF = output_.detach().numpy() 44 | 45 | # TP 46 | n_shards = test_config.execution.tensor_parallel 47 | test_config.execution.tensor_parallel = n_shards 48 | 49 | # popxl 50 | ir = popxl.Ir() 51 | ir.replication_factor = n_shards 52 | 53 | replica_grouping = ir.replica_grouping(stride=1, group_size=1) 54 | 55 | main = ir.main_graph 56 | 57 | with main: 58 | inputs_data, inputs_host_steam, inputs_tensors = zip( 59 | *[ 60 | addons.host_load(input_t.reshape(-1, test_config.model.hidden_size), popxl.float32, name="input"), 61 | ] 62 | ) 63 | (x,) = inputs_tensors 64 | 65 | args, graph = DollyDecoderBlockTP(test_config).create_graph(x) 66 | 67 | ff_vars = args.init() 68 | ff = graph.bind(ff_vars) 69 | fwd_info = ff.call_with_info(x) 70 | (acts,) = fwd_info.outputs 71 | 72 | fwd_d2h = addons.host_store(acts) 73 | 74 | # Run `OpToIdentityPattern` among others part of `PreAliasPatterns` 75 | apply_pre_alias_patterns(ir, level="default") 76 | 77 | weights = DollyDecoderBlockTP.hf_mapping(test_config, ff_vars, hf_model) 78 | 79 | inputs = {h2d: repeat(data, n_shards) for h2d, data in zip(inputs_host_steam, inputs_data)} 80 | 81 | 
with popxl.Session(ir, "ipu_hw") as session: 82 | session.write_variables_data(weights) 83 | outputs_popxl = session.run(inputs) 84 | 85 | fwd_data = outputs_popxl[fwd_d2h] 86 | 87 | assert len(fwd_data) == n_shards 88 | 89 | # Assert all IPU outputs are identical 90 | for i in range(1, n_shards): 91 | np.testing.assert_equal(fwd_data[0], fwd_data[i]) 92 | # Assert nearly equal to HF 93 | np.testing.assert_almost_equal(output_HF, fwd_data[0].reshape(output_HF.shape), 3) 94 | -------------------------------------------------------------------------------- /dolly2-instruction-following/tests/integration/layers/test_feed_forward_TP.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | import numpy as np 3 | import torch 4 | 5 | # HF 6 | from transformers.models.gpt_neox.configuration_gpt_neox import GPTNeoXConfig as HFConfig 7 | from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXMLP 8 | 9 | import popxl 10 | 11 | import popxl_addons as addons 12 | from popxl_addons.patterns import apply_pre_alias_patterns 13 | 14 | from config import DollyConfig 15 | from modelling.feed_forward import DollyFeedForwardTP 16 | from popxl_addons.array_munging import repeat 17 | 18 | 19 | def test_feed_forward_TP_cmp_huggingface(test_config: DollyConfig): 20 | torch.manual_seed(42) 21 | 22 | batch_size = test_config.execution.micro_batch_size 23 | seq_len = test_config.model.sequence_length 24 | hidden_size = test_config.model.hidden_size 25 | intermediate_size = hidden_size * 4 26 | 27 | # HuggingFace 28 | config = HFConfig( 29 | hidden_size=hidden_size, 30 | max_position_embeddings=seq_len, 31 | intermediate_size=intermediate_size, 32 | num_attention_heads=test_config.model.attention.heads, 33 | ) 34 | hf_model = GPTNeoXMLP(config).eval() 35 | 36 | # HF forward 37 | input_t = torch.rand((batch_size, seq_len, hidden_size)) 38 | outputs = hf_model(input_t) 39 | output_ = outputs.reshape(batch_size * seq_len, hidden_size) 40 | output_HF = output_.detach().numpy() 41 | 42 | # TP 43 | n_shards = test_config.execution.tensor_parallel 44 | 45 | # popxl 46 | ir = popxl.Ir() 47 | ir.replication_factor = n_shards 48 | 49 | main = ir.main_graph 50 | 51 | with main: 52 | inputs_data, inputs_host_steam, inputs_tensors = zip( 53 | *[ 54 | addons.host_load(input_t.reshape(-1, test_config.model.hidden_size), popxl.float32, name="input"), 55 | ] 56 | ) 57 | (x,) = inputs_tensors 58 | 59 | ff_args, ff_graph = DollyFeedForwardTP(test_config).create_graph(x) 60 | 61 | ff_vars = ff_args.init() 62 | ff = ff_graph.bind(ff_vars) 63 | fwd_info = ff.call_with_info(x) 64 | (acts,) = fwd_info.outputs 65 | 66 | fwd_d2h = addons.host_store(acts) 67 | 68 | # Run `OpToIdentityPattern` among others part of `PreAliasPatterns` 69 | apply_pre_alias_patterns(ir, level="default") 70 | 71 | weights = DollyFeedForwardTP.hf_mapping(test_config, ff_vars, hf_model) 72 | 73 | inputs = {h2d: repeat(data, n_shards) for h2d, data in zip(inputs_host_steam, inputs_data)} 74 | with popxl.Session(ir, "ipu_hw") as session: 75 | session.write_variables_data(weights) 76 | outputs_popxl = session.run(inputs) 77 | 78 | fwd_data = outputs_popxl[fwd_d2h] 79 | 80 | assert len(fwd_data) == n_shards 81 | 82 | # Assert all IPU outputs are identical 83 | for i in range(1, n_shards): 84 | np.testing.assert_equal(fwd_data[0], fwd_data[i]) 85 | # Assert nearly equal to HF 86 | np.testing.assert_almost_equal(output_HF, fwd_data[0].reshape(output_HF.shape), 3) 87 | 
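# Note on the popxl_addons.array_munging helpers used in these layer tests (assumed
# semantics, illustrated with plain NumPy): repeat(x, n) stacks n identical copies of
# the host batch along a new leading replica axis so every tensor-parallel replica
# sees the same input, e.g.
#   x = np.arange(6).reshape(2, 3)
#   np.stack([x] * 4).shape  # (4, 2, 3): one copy per replica
# while shard(x, n, axis) splits a weight into n per-replica pieces along `axis`.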
-------------------------------------------------------------------------------- /dolly2-instruction-following/tests/integration/layers/test_lm_TP.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | import numpy as np 3 | import torch 4 | 5 | # HF 6 | from transformers.models.gpt_neox import GPTNeoXConfig as HFConfig 7 | from transformers.models.gpt_neox import GPTNeoXForCausalLM 8 | 9 | import popxl 10 | from popxl.utils import to_numpy 11 | 12 | import popxl_addons as addons 13 | from popxl_addons.patterns import apply_pre_alias_patterns 14 | 15 | from config import DollyConfig 16 | from modelling.embedding import DollyEmbeddingsTP 17 | from modelling.dolly_lm import DollyLMHeadModelTP 18 | 19 | from popxl_addons.array_munging import shard 20 | 21 | 22 | def test_lm_TP_cmp_huggingface(test_config: DollyConfig): 23 | torch.manual_seed(42) 24 | batch_size = test_config.execution.micro_batch_size 25 | hidden_size = test_config.model.hidden_size 26 | intermediate_size = hidden_size * 4 27 | seq_len = test_config.model.sequence_length 28 | # HuggingFace 29 | config = HFConfig( 30 | num_hidden_layers=test_config.model.layers, 31 | vocab_size=test_config.model.embedding.vocab_size, 32 | hidden_size=hidden_size, 33 | max_position_embeddings=seq_len, 34 | intermediate_size=intermediate_size, 35 | num_attention_heads=test_config.model.attention.heads, 36 | rotary_dim=test_config.model.attention.rotary_dim, 37 | ) 38 | hf_model = GPTNeoXForCausalLM(config).eval() 39 | 40 | # HF forward 41 | input_t = torch.randint(0, test_config.model.embedding.vocab_size, (batch_size, test_config.model.sequence_length)) 42 | output_HF = hf_model(input_ids=input_t)[0] 43 | output_HF = output_HF.detach().numpy() 44 | 45 | # n_shards 46 | n_shards = test_config.execution.tensor_parallel 47 | 48 | # Offset inputs 49 | words_offsetted = DollyEmbeddingsTP.offset_inputs(test_config, to_numpy(input_t)) 50 | # popxl 51 | ir = popxl.Ir() 52 | ir.replication_factor = n_shards 53 | replica_grouping = ir.replica_grouping(stride=1, group_size=1) 54 | main = ir.main_graph 55 | 56 | with main: 57 | inputs_data, inputs_host_steam, inputs_tensors = zip( 58 | *[ 59 | addons.host_load(words_offsetted[0], popxl.int32, name="words"), 60 | ] 61 | ) 62 | (words,) = inputs_tensors 63 | facts, graph = DollyLMHeadModelTP(test_config).create_graph(words) 64 | vars = facts.init() 65 | gpt = graph.bind(vars) 66 | call_info = gpt.call_with_info(words) 67 | act, *_ = call_info.outputs 68 | act_stream = addons.host_store(act) 69 | 70 | apply_pre_alias_patterns(ir, level="default") 71 | 72 | # Map weights from huggingface 73 | weights = DollyLMHeadModelTP.hf_mapping(test_config, vars, hf_model) 74 | 75 | inputs = dict(zip(inputs_host_steam, [words_offsetted])) 76 | 77 | ir.num_host_transfers = test_config.execution.device_iterations 78 | 79 | with popxl.Session(ir, "ipu_hw") as session: 80 | session.write_variables_data(weights) 81 | outs = session.run(inputs) 82 | 83 | # Fwd output 84 | fwd_data = outs[act_stream] 85 | assert len(fwd_data) == n_shards 86 | fwd_data_full = np.concatenate(fwd_data, axis=-1)[:, : test_config.model.embedding.vocab_size] 87 | np.testing.assert_almost_equal(output_HF, fwd_data_full.reshape(output_HF.shape), 3) 88 | -------------------------------------------------------------------------------- /dolly2-instruction-following/tests/integration/layers/test_model_TP.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | import numpy as np 3 | import torch 4 | 5 | # HF 6 | from transformers.models.gpt_neox.configuration_gpt_neox import GPTNeoXConfig as HFConfig 7 | from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXModel 8 | 9 | import popxl 10 | from popxl.utils import to_numpy 11 | 12 | import popxl_addons as addons 13 | from popxl_addons.patterns import apply_pre_alias_patterns 14 | 15 | from config import DollyConfig 16 | from modelling.embedding import DollyEmbeddingsTP 17 | from modelling.dolly_model import DollyModelTP 18 | 19 | 20 | def test_model_TP_cmp_huggingface(test_config: DollyConfig): 21 | torch.manual_seed(42) 22 | 23 | batch_size = test_config.execution.micro_batch_size 24 | hidden_size = test_config.model.hidden_size 25 | seq_len = test_config.model.sequence_length 26 | intermediate_size = hidden_size * 4 27 | # HuggingFace 28 | config = HFConfig( 29 | num_hidden_layers=test_config.model.layers, 30 | vocab_size=test_config.model.embedding.vocab_size, 31 | hidden_size=hidden_size, 32 | max_position_embeddings=seq_len, 33 | intermediate_size=intermediate_size, 34 | num_attention_heads=test_config.model.attention.heads, 35 | rotary_dim=test_config.model.attention.rotary_dim, 36 | ) 37 | hf_model = GPTNeoXModel(config).eval() 38 | 39 | # HF forward 40 | input_t = torch.randint(0, test_config.model.embedding.vocab_size, (batch_size, test_config.model.sequence_length)) 41 | output_HF = hf_model(input_ids=input_t)[0] 42 | output_HF = output_HF.detach().numpy() 43 | 44 | # TP 45 | tp = test_config.execution.tensor_parallel 46 | 47 | # Offset inputs 48 | words_offsetted = DollyEmbeddingsTP.offset_inputs(test_config, to_numpy(input_t)) 49 | 50 | # popxl 51 | ir = popxl.Ir() 52 | ir.replication_factor = tp 53 | replica_grouping = ir.replica_grouping(stride=1, group_size=1) 54 | main = ir.main_graph 55 | 56 | with main: 57 | inputs_data, inputs_host_steam, inputs_tensors = zip( 58 | *[ 59 | addons.host_load(words_offsetted[0], popxl.int32, name="words"), 60 | ] 61 | ) 62 | (words,) = inputs_tensors 63 | facts, graph = DollyModelTP(test_config).create_graph(words) 64 | 65 | vars = facts.init() 66 | gpt = graph.bind(vars) 67 | call_info = gpt.call_with_info(words) 68 | act, *_ = call_info.outputs 69 | act_stream = addons.host_store(act) 70 | 71 | apply_pre_alias_patterns(ir, level="default") 72 | 73 | # Map weights from huggingface 74 | weights = DollyModelTP.hf_mapping(test_config, vars, hf_model) 75 | 76 | inputs = dict(zip(inputs_host_steam, [words_offsetted])) 77 | 78 | ir.num_host_transfers = test_config.execution.device_iterations 79 | 80 | with popxl.Session(ir, "ipu_hw") as session: 81 | session.write_variables_data(weights) 82 | outs = session.run(inputs) 83 | 84 | # Fwd output 85 | fwd_data = outs[act_stream] 86 | 87 | assert len(fwd_data) == tp 88 | for i in range(1, tp): 89 | np.testing.assert_equal(fwd_data[0], fwd_data[i]) 90 | 91 | np.testing.assert_almost_equal(output_HF, fwd_data[0].reshape(output_HF.shape), 3) 92 | -------------------------------------------------------------------------------- /dolly2-instruction-following/tests/test_config.yml: -------------------------------------------------------------------------------- 1 | model: 2 | sequence_length: 256 # 8 3 | embedding: 4 | vocab_size: 128 5 | hidden_size: 128 6 | layers: 2 7 | attention: 8 | heads: 4 9 | rotary_dim: 8 10 | precision: "float32" 11 | execution: 12 | 
micro_batch_size: 1 13 | data_parallel: 1 14 | tensor_parallel: 4 15 | -------------------------------------------------------------------------------- /dolly2-instruction-following/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | from .simple_parsing_tools import * 3 | -------------------------------------------------------------------------------- /dolly2-instruction-following/utils/simple_parsing_tools.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | from examples_utils.parsing.simple_parsing_tools import * 4 | -------------------------------------------------------------------------------- /gptj-text-generation/api.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | from utils.trainer import GPTJTrainer 4 | from utils.pipeline import GPTJPipeline, GPTJEntailmentPipeline 5 | -------------------------------------------------------------------------------- /gptj-text-generation/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | 3 | from .config import GPTJConfig, Execution 4 | import os 5 | from pathlib import Path 6 | 7 | CONFIG_DIR = Path(os.path.dirname(__file__)) 8 | 9 | del os, Path 10 | -------------------------------------------------------------------------------- /gptj-text-generation/config/finetuning.yml: -------------------------------------------------------------------------------- 1 | # -------- Models -------- 2 | "gptj_6B_1024": &gptj_6B_1024 3 | model: 4 | sequence_length: 1024 5 | layers: 28 6 | hidden_size: 4096 7 | dropout_prob: 0.0 8 | attention: 9 | heads: 16 10 | rotary_positional_embeddings_base: 10000 11 | rotary_dim: 64 12 | embedding: 13 | vocab_size: 50400 14 | training: 15 | global_batch_size: 128 16 | steps: 500 17 | optimizer: 18 | optimizer: adamw 19 | learning_rate: 20 | maximum: 5e-06 21 | warmup_proportion: 0.005995 22 | weight_decay: 0.0 23 | 24 | tiny: &tiny 25 | model: 26 | sequence_length: 8 27 | embedding: 28 | vocab_size: 128 29 | hidden_size: 64 30 | layers: 2 31 | attention: 32 | heads: 4 33 | rotary_dim: 16 34 | training: 35 | global_batch_size: 16 36 | steps: 100000 37 | optimizer: 38 | optimizer: adamw 39 | learning_rate: 40 | maximum: 0.00001 41 | warmup_proportion: 0.00625 42 | weight_decay: 0.01 43 | 44 | # ------------------------- 45 | 46 | 47 | # ------- Execution ------- 48 | release: 49 | "gptj_6B_1024_pod64": 50 | <<: *gptj_6B_1024 51 | execution: 52 | micro_batch_size: 1 53 | loss_scaling: 4096 54 | io_tiles: 128 55 | data_parallel: 4 56 | tensor_parallel: 16 57 | available_memory_proportion: [ 0.2 ] 58 | attention_serialisation: 2 59 | 60 | "gptj_6B_1024_pod16": 61 | <<: *gptj_6B_1024 62 | execution: 63 | micro_batch_size: 1 64 | loss_scaling: 4096 65 | io_tiles: 128 66 | data_parallel: 1 67 | tensor_parallel: 16 68 | available_memory_proportion: [ 0.2 ] 69 | attention_serialisation: 2 70 | 71 | tiny: 72 | <<: *tiny 73 | execution: 74 | io_tiles: 64 75 | micro_batch_size: 1 76 | data_parallel: 2 77 | tensor_parallel: 4 78 | attention_serialisation: 2 79 | -------------------------------------------------------------------------------- /gptj-text-generation/config/inference.yml: 
-------------------------------------------------------------------------------- 1 | # -------- Models -------- 2 | tiny: &tiny 3 | model: 4 | eval: true 5 | layers: 2 6 | hidden_size: 64 7 | sequence_length: 8 8 | attention: 9 | heads: 4 10 | rotary_dim: 16 11 | embedding: 12 | vocab_size: 128 13 | 14 | gpt-j: &gpt-j 15 | model: 16 | eval: true 17 | layers: 28 18 | hidden_size: 4096 19 | sequence_length: 1024 20 | attention: 21 | heads: 16 22 | rotary_positional_embeddings_base: 10000 23 | rotary_dim: 64 24 | embedding: 25 | vocab_size: 50400 26 | # ------------------------- 27 | 28 | # ------- Execution ------- 29 | release: 30 | tiny: 31 | <<: *tiny 32 | execution: 33 | micro_batch_size: 1 34 | available_memory_proportion: [ 0.4 ] 35 | tensor_parallel: 4 36 | 37 | gpt-j: 38 | <<: *gpt-j 39 | execution: 40 | micro_batch_size: 12 41 | available_memory_proportion: [ 0.4 ] 42 | tensor_parallel: 4 43 | 44 | gpt-j-gq-4bit: 45 | <<: *gpt-j 46 | execution: 47 | micro_batch_size: 12 48 | available_memory_proportion: [ 0.4 ] 49 | tensor_parallel: 4 50 | group_quantise_weights: 64 51 | 52 | gpt-j-mnli: 53 | <<: *gpt-j 54 | execution: 55 | micro_batch_size: 16 56 | available_memory_proportion: [ 0.4 ] 57 | tensor_parallel: 4 58 | 59 | gpt-j-mnli-gq-4bit: 60 | <<: *gpt-j 61 | execution: 62 | micro_batch_size: 16 63 | available_memory_proportion: [ 0.4 ] 64 | tensor_parallel: 4 65 | group_quantise_weights: 64 66 | -------------------------------------------------------------------------------- /gptj-text-generation/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | -------------------------------------------------------------------------------- /gptj-text-generation/data/hf_data_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | # 3 | # Copyright 2020 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # This file has been modified by Graphcore Ltd. 18 | 19 | from itertools import chain 20 | from config import GPTJConfig 21 | 22 | 23 | def group_texts(config: GPTJConfig): 24 | seq_len_1 = config.model.sequence_length + 1 25 | 26 | def func(examples): 27 | # Concatenate all texts. 28 | inputs = list(chain(*examples["input_ids"])) 29 | total_length = len(inputs) 30 | # We drop the small remainder instead of padding 31 | if total_length >= seq_len_1: 32 | total_length = (total_length // seq_len_1) * seq_len_1 33 | # Split by chunks of max_len. 
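# Worked example (illustrative): with sequence_length = 4 (so seq_len_1 = 5) and
# concatenated tokens [0, 1, ..., 9], data becomes [[0,1,2,3,4], [5,6,7,8,9]],
# giving input_ids [[0,1,2,3], [5,6,7,8]] and labels [[1,2,3,4], [6,7,8,9]],
# i.e. the labels are the inputs shifted left by one token.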
34 | data = [inputs[i : i + seq_len_1] for i in range(0, total_length, seq_len_1)] 35 | result = { 36 | "input_ids": [d[:-1] for d in data], 37 | "labels": [d[1:] for d in data], 38 | } 39 | return result 40 | 41 | return func 42 | -------------------------------------------------------------------------------- /gptj-text-generation/imgs/bs_buffers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/gptj-text-generation/imgs/bs_buffers.png -------------------------------------------------------------------------------- /gptj-text-generation/imgs/data_parallelism.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/gptj-text-generation/imgs/data_parallelism.png -------------------------------------------------------------------------------- /gptj-text-generation/imgs/dp_tp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/gptj-text-generation/imgs/dp_tp.png -------------------------------------------------------------------------------- /gptj-text-generation/imgs/execution.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/gptj-text-generation/imgs/execution.jpg -------------------------------------------------------------------------------- /gptj-text-generation/imgs/gq-speed-accuracy-tradeoff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/gptj-text-generation/imgs/gq-speed-accuracy-tradeoff.png -------------------------------------------------------------------------------- /gptj-text-generation/imgs/mnli_dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/gptj-text-generation/imgs/mnli_dataset.png -------------------------------------------------------------------------------- /gptj-text-generation/imgs/rts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/gptj-text-generation/imgs/rts.png -------------------------------------------------------------------------------- /gptj-text-generation/imgs/tensor_parallelism.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/gptj-text-generation/imgs/tensor_parallelism.png -------------------------------------------------------------------------------- /gptj-text-generation/imgs/tp.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/gptj-text-generation/imgs/tp.jpg -------------------------------------------------------------------------------- /gptj-text-generation/imgs/tp_dp_rts.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/gptj-text-generation/imgs/tp_dp_rts.png -------------------------------------------------------------------------------- /gptj-text-generation/modelling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | -------------------------------------------------------------------------------- /gptj-text-generation/modelling/gptj_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | import numpy as np 3 | from typing import Dict 4 | from config import GPTJConfig 5 | import torch 6 | 7 | import popxl 8 | from popxl.utils import to_numpy 9 | 10 | import popxl_addons as addons 11 | from popxl_addons import NamedTensors 12 | from popxl_addons.named_tensors import NamedTensorData 13 | 14 | from popxl_addons.layers import LayerNorm 15 | 16 | from .embedding import GPTJEmbeddingsTP 17 | from .decoder import GPTJDecoderTP, GPTJDecoderBlockTP 18 | 19 | from transformers.models.gptj.modeling_gptj import GPTJModel as HFModel 20 | from transformers.models.gptj.configuration_gptj import GPTJConfig as GPTJConfigHF 21 | 22 | 23 | class GPTJModelTP(addons.Module): 24 | def __init__(self, config: GPTJConfig, include_layer_norm=True): 25 | super().__init__() 26 | self.config = config 27 | # sharded, then last bit identical 28 | self.embeddings = GPTJEmbeddingsTP(self.config) 29 | # identical inputs, then sharded, then identical 30 | self.decoder = GPTJDecoderTP(self.config) 31 | # identical 32 | self.include_layer_norm = include_layer_norm 33 | if self.include_layer_norm: 34 | self.ln_f = LayerNorm() 35 | 36 | def build(self, input_ids: popxl.Tensor): 37 | x = self.embeddings(input_ids) 38 | x = self.decoder(x) 39 | if self.include_layer_norm: 40 | x = self.ln_f(x) 41 | return x 42 | 43 | @staticmethod 44 | def hf_mapping( 45 | config: GPTJConfig, variables: NamedTensors, hf_model: HFModel, layer_norm=True 46 | ) -> Dict[popxl.Tensor, np.ndarray]: 47 | dtype = config.model.dtype 48 | weights = {} 49 | if layer_norm: 50 | weights = { 51 | variables.ln_f.weight: to_numpy(hf_model.ln_f.weight.data, dtype), 52 | variables.ln_f.bias: to_numpy(hf_model.ln_f.bias.data, dtype), 53 | } 54 | 55 | weights.update(GPTJEmbeddingsTP.hf_mapping(config, variables.embeddings, hf_model)) 56 | 57 | for l in range(config.model.layers): 58 | weights.update(GPTJDecoderBlockTP.hf_mapping(config, variables.decoder[l], hf_model.h[l])) 59 | 60 | return weights 61 | 62 | @staticmethod 63 | def to_hf(variables_data: NamedTensorData, hf_model: HFModel, layer_norm=True) -> Dict[str, torch.Tensor]: 64 | state_dict = {} 65 | if layer_norm: 66 | state_dict["ln_f.weight"] = torch.tensor(variables_data.ln_f.weight, dtype=hf_model.config.torch_dtype) 67 | state_dict["ln_f.bias"] = torch.tensor(variables_data.ln_f.bias, dtype=hf_model.config.torch_dtype) 68 | 69 | state_dict.update(GPTJEmbeddingsTP.to_hf(hf_model.config, variables_data.embeddings, hf_model.wte)) 70 | for l in range(hf_model.config.n_layer): 71 | state_dict.update( 72 | { 73 | "h." + str(l) + "." 
+ k: v 74 | for k, v in GPTJDecoderBlockTP.to_hf( 75 | hf_model.config, variables_data.decoder[l], hf_model.h[l] 76 | ).items() 77 | } 78 | ) 79 | return state_dict 80 | -------------------------------------------------------------------------------- /gptj-text-generation/modelling/hf_mapping.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | 3 | from typing import Dict 4 | import numpy as np 5 | 6 | from transformers.models.gptj import GPTJModel as HFModel 7 | from transformers.models.gptj import GPTJForCausalLM as HFLMHeadModel 8 | from transformers.models.gptj.configuration_gptj import GPTJConfig as GPTJConfigHF 9 | 10 | import popxl 11 | from popxl_addons import TaskSession 12 | 13 | from config import GPTJConfig 14 | from modelling.gptj_model import GPTJModelTP 15 | from modelling.gptj_lm import GPTJLMHeadLossAndGradTP, GPTJLMHeadModelTP 16 | 17 | 18 | def hf_mapping_lm_tp( 19 | config: GPTJConfig, session: TaskSession, pretrained: HFLMHeadModel 20 | ) -> Dict[popxl.Tensor, np.ndarray]: 21 | load_to = session.state 22 | if "fwd" in session.state: 23 | load_to = session.state.fwd 24 | weights = GPTJLMHeadModelTP.hf_mapping(config, load_to, pretrained) 25 | return weights 26 | 27 | 28 | def hf_mapping_TP(config: GPTJConfig, session: TaskSession, pretrained: HFModel) -> Dict[popxl.Tensor, np.ndarray]: 29 | load_to = session.state 30 | if "fwd" in session.state: 31 | load_to = session.state.fwd 32 | weights = GPTJModelTP.hf_mapping(config, load_to, pretrained) 33 | return weights 34 | 35 | 36 | def load_lm_to_hf(session: TaskSession, hf_model: HFLMHeadModel) -> HFLMHeadModel: 37 | weights = session.get_named_tensors_data() 38 | if "fwd" in weights: 39 | weights = weights.fwd 40 | state_dict = GPTJLMHeadModelTP.to_hf(weights, hf_model) 41 | # check only missing keys are mask-related keys 42 | hf_state_keys = hf_model.state_dict().keys() 43 | popxl_keys = state_dict.keys() 44 | 45 | def should_check(k: str): 46 | return "attn.bias" not in k and "attn.masked_bias" not in k 47 | 48 | for k in hf_state_keys: 49 | if should_check(k) and k not in popxl_keys: 50 | raise KeyError(f"key {k} not found in session state") 51 | 52 | hf_model.load_state_dict(state_dict, strict=False) 53 | return hf_model 54 | 55 | 56 | def load_to_hf(session: TaskSession, hf_model: HFModel) -> HFModel: 57 | weights = session.get_named_tensors_data() 58 | if "fwd" in weights: 59 | weights = weights.fwd 60 | 61 | state_dict = GPTJModelTP.to_hf(weights, hf_model) 62 | # check only missing keys are mask-related keys 63 | hf_state_keys = hf_model.state_dict().keys() 64 | popxl_keys = state_dict.keys() 65 | 66 | def should_check(k: str): 67 | return "attn.bias" not in k and "attn.masked_bias" not in k 68 | 69 | for k in hf_state_keys: 70 | if should_check(k) and k not in popxl_keys: 71 | raise KeyError(f"key {k} not found in session state") 72 | 73 | hf_model.load_state_dict(state_dict, strict=False) 74 | return hf_model 75 | -------------------------------------------------------------------------------- /gptj-text-generation/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = 3 | tests 4 | 5 | addopts = 6 | -r a 7 | -v 8 | 9 | python_paths = . 
../../../utils/ 10 | -------------------------------------------------------------------------------- /gptj-text-generation/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-pythonpath 3 | -------------------------------------------------------------------------------- /gptj-text-generation/requirements.txt: -------------------------------------------------------------------------------- 1 | --find-links https://download.pytorch.org/whl/cpu/torch_stable.html 2 | 3 | 4 | graphcore-cloud-tools[logger] @ git+https://github.com/graphcore/graphcore-cloud-tools@v0.3 5 | examples-utils[common] @ git+https://github.com/graphcore/examples-utils.git@7cd37a8eccabe88e3741eef2c31bafd4fcd30c4c 6 | pyyaml==5.4.1 7 | dataclasses==0.8; python_version < '3.7' 8 | transformers==4.25.0 9 | datasets 10 | evaluate==0.4.0 11 | tfrecord==1.14.1 12 | torch==2.0.1+cpu 13 | scipy>=1.5.4 14 | more-itertools==8.13.0 15 | wandb==0.12.8 16 | sklearn==0.0 17 | 18 | pytest==6.2.5 19 | pytest-pythonpath==0.7.4 20 | 21 | git+https://github.com/graphcore/popxl-addons.git@sdk-release-3.3.0 22 | 23 | protobuf==3.20.*; python_version > '3.6' 24 | -------------------------------------------------------------------------------- /gptj-text-generation/tests/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | import os 3 | from config import GPTJConfig 4 | 5 | import pytest 6 | 7 | from config import GPTJConfig 8 | from utils.simple_parsing_tools import parse_args_with_config_file 9 | 10 | 11 | def _test_config_file(): 12 | return os.path.join(os.path.dirname(__file__), "test_config.yml") 13 | 14 | 15 | @pytest.fixture 16 | def test_config_file(): 17 | return _test_config_file() 18 | 19 | 20 | @pytest.fixture 21 | def test_config(): 22 | return parse_args_with_config_file(GPTJConfig, ["--config", _test_config_file()]) 23 | 24 | 25 | # Below functions enable long tests to be skipped, unless a --long-test 26 | # cli option is specified. 27 | def pytest_addoption(parser): 28 | parser.addoption("--long-tests", action="store_true", default=False, help="Run long tests") 29 | -------------------------------------------------------------------------------- /gptj-text-generation/tests/integration/execution/test_execution.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
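# Smoke tests: run finetuning.py and inference.py as subprocesses with a tiny model
# configuration and assert that a "Duration" line is printed on completion.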
2 | from pathlib import Path 3 | from examples_tests.test_util import SubProcessChecker 4 | import os 5 | import sys 6 | 7 | root_dir = Path(__file__).parent.parent.parent.parent.resolve() 8 | 9 | 10 | def gptj_root_env_path(): 11 | env = os.environ 12 | env["PYTHONPATH"] = ":".join((*sys.path, str(root_dir))) 13 | return env 14 | 15 | 16 | class TestPretraining(SubProcessChecker): 17 | def test_finetuning(self): 18 | self.run_command( 19 | "python3 finetuning.py --config tiny --layers 3 " 20 | "--global_batch_size 16 --micro_batch_size 2 --data_parallel 2 --tensor_parallel 2 " 21 | "--vocab_size 128 --sequence_length 8 --rotary_dim 16 " 22 | "--hidden_size 64 --heads 4", 23 | root_dir, 24 | ["Duration"], 25 | env=gptj_root_env_path(), 26 | ) 27 | 28 | def test_inference(self): 29 | self.run_command( 30 | "python3 inference.py --config tiny --layers 3 " 31 | "--micro_batch_size 16 --data_parallel 1 --tensor_parallel 2 " 32 | "--vocab_size 128 --sequence_length 8 --rotary_dim 16 " 33 | "--hidden_size 64 --heads 4", 34 | root_dir, 35 | ["Duration"], 36 | env=gptj_root_env_path(), 37 | ) 38 | -------------------------------------------------------------------------------- /gptj-text-generation/tests/test_config.yml: -------------------------------------------------------------------------------- 1 | model: 2 | sequence_length: 8 3 | embedding: 4 | vocab_size: 128 5 | hidden_size: 64 6 | layers: 2 7 | attention: 8 | heads: 4 9 | rotary_dim: 8 10 | eval: True 11 | precision: "float32" 12 | training: 13 | global_batch_size: 2 14 | execution: 15 | micro_batch_size: 2 16 | data_parallel: 1 17 | attention_serialisation: 2 18 | -------------------------------------------------------------------------------- /gptj-text-generation/tests_serial/distributed_sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
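# Not collected by pytest directly: tests_serial/test_distributed_data.py launches this
# script as a subprocess under poprun (poprun --num-instances 2 --num-replicas 2
# python3 tests_serial/distributed_sampler.py) and asserts on the "Passed test"
# messages printed below.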
2 | import popxl 3 | from popxl import ops 4 | from data.data_utils import DistributedSampler, WorkerInit 5 | from torch.utils.data import DataLoader 6 | from typing import Tuple 7 | import numpy as np 8 | from pathlib import Path 9 | from mpi4py import MPI 10 | import time 11 | import popdist 12 | 13 | 14 | def sample_program(input_shape: Tuple, replicas: int): 15 | ir = popxl.Ir("popdist") 16 | 17 | with ir.main_graph: 18 | x_h2d = popxl.h2d_stream(input_shape, dtype=popxl.float32, name="x_in") 19 | x_d2h = popxl.d2h_stream(input_shape, dtype=popxl.float32, name="x_out") 20 | x = ops.host_load(x_h2d) 21 | ops.host_store(x_d2h, x) 22 | 23 | ir.num_host_transfers = 1 24 | return popxl.Session(ir, "ipu_hw"), x_h2d, x_d2h 25 | 26 | 27 | def distributed_sampler(): 28 | bs = 2 29 | inps = 5 30 | dataset_size = 2 * 10 31 | worker_seed = 47 32 | workers = 4 33 | epochs = 3 34 | replicas = 2 35 | 36 | dataset = np.random.random((dataset_size, inps)).astype(np.float32) 37 | sampler = DistributedSampler(dataset) 38 | dl = DataLoader( 39 | dataset, 40 | batch_size=bs, 41 | drop_last=True, 42 | num_workers=workers, 43 | worker_init_fn=WorkerInit(worker_seed), 44 | persistent_workers=workers > 0, 45 | sampler=sampler, 46 | ) 47 | session, in_stream, out_stream = sample_program((bs, inps), replicas) 48 | 49 | # check each instance get different data 50 | loader_list = list(dl)[0][0][0].numpy() 51 | 52 | # MPI to broadcast data in root=1 to root=0 53 | comm = MPI.COMM_WORLD 54 | rank = comm.Get_rank() 55 | loader_list_copy = np.copy(loader_list) 56 | comm.Bcast(loader_list, root=1) 57 | 58 | # Assert if data broadcast to root=0 is different 59 | if comm.Get_rank() == 0 and not np.all(loader_list_copy == loader_list): 60 | print("Passed test: instances have different data") 61 | 62 | # Wait until both roots are finished 63 | time.sleep(2) 64 | 65 | # check epochs behaviour 66 | epochs_first_data = [] 67 | for epoch in range(epochs): 68 | # set epoch explicitly before iterating dl 69 | sampler.set_epoch(epoch) 70 | step = 0 71 | for data in dl: 72 | x = data 73 | with session: 74 | out = session.run({in_stream: x})[out_stream] 75 | if step == 0: 76 | epochs_first_data.append(out) 77 | step += 1 78 | 79 | assert len(epochs_first_data) == epochs, f"Expected {epochs} elements to compare, found {len(epochs_first_data)}" 80 | # check each epoch data is sampled in different order 81 | for first_item in epochs_first_data[1:]: 82 | not np.all(first_item == epochs_first_data[0]) 83 | print("Passed test: each epoch samples dataset in different order") 84 | 85 | 86 | if __name__ == "__main__": 87 | distributed_sampler() 88 | -------------------------------------------------------------------------------- /gptj-text-generation/tests_serial/test_distributed_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
2 | from examples_tests.test_util import SubProcessChecker 3 | from pathlib import Path 4 | import os 5 | import subprocess 6 | import pytest 7 | import gc 8 | 9 | root_dir = Path(__file__).parent.parent.resolve() 10 | 11 | 12 | def run_poprun_cmdline(poprun_args, cmdline_args, script): 13 | cmd = ["poprun"] 14 | cmd.extend([str(item) for sublist in poprun_args.items() for item in sublist if item != ""]) 15 | cmd.append("python3") 16 | cmd.append(script) 17 | cmd.extend([str(item) for sublist in cmdline_args.items() for item in sublist if item != ""]) 18 | try: 19 | out = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=root_dir) 20 | except subprocess.CalledProcessError as e: 21 | print(f"TEST FAILED") 22 | print(f"stdout={e.stdout.decode('utf-8',errors='ignore')}") 23 | print(f"stderr={e.stderr.decode('utf-8',errors='ignore')}") 24 | raise 25 | return out, out.stdout.decode("utf-8"), out.stderr.decode("utf-8") 26 | 27 | 28 | def test_poprun_dataset(): 29 | """ 30 | Launch poprun as subprocess and assert output 31 | """ 32 | gc.collect() 33 | out, stdout, stderr = run_poprun_cmdline( 34 | { 35 | "--num-instances": 2, 36 | "--num-replicas": 2, 37 | }, 38 | {}, 39 | os.path.join(root_dir, "tests_serial/distributed_sampler.py"), 40 | ) 41 | assert "Passed test: instances have different data" in stdout, stderr 42 | assert "Passed test: each epoch samples dataset in different order" in stdout, stderr 43 | 44 | 45 | def test_poprun_dataloader_checkpoints(): 46 | """ 47 | Launch poprun as subprocess and assert output 48 | """ 49 | gc.collect() 50 | out, stdout, stderr = run_poprun_cmdline( 51 | { 52 | "--num-instances": 2, 53 | "--num-replicas": 2, 54 | }, 55 | {}, 56 | os.path.join(root_dir, "tests_serial/dataloader_checkpoints.py"), 57 | ) 58 | assert "Passed test: distributed dataloader checkpoint" in stdout, stderr 59 | -------------------------------------------------------------------------------- /gptj-text-generation/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | from .simple_parsing_tools import * 3 | -------------------------------------------------------------------------------- /gptj-text-generation/utils/simple_parsing_tools.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
2 | 3 | from examples_utils.parsing.simple_parsing_tools import * 4 | -------------------------------------------------------------------------------- /images/folder_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/images/folder_logo.png -------------------------------------------------------------------------------- /images/go_emotions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/images/go_emotions.png -------------------------------------------------------------------------------- /images/jupyter_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/images/jupyter_logo.png -------------------------------------------------------------------------------- /llama2-chatbot/.gitignore: -------------------------------------------------------------------------------- 1 | .graphcore/ 2 | .ipynb_checkpoints/ 3 | .exe_cache/ 4 | .__pycache__/ 5 | *.pyc 6 | -------------------------------------------------------------------------------- /llama2-chatbot/api/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | from .pipeline import LlamaPipeline 4 | -------------------------------------------------------------------------------- /llama2-chatbot/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
2 | 3 | from .config import LlamaConfig, Execution 4 | import os 5 | from pathlib import Path 6 | 7 | CONFIG_DIR = Path(os.path.dirname(__file__)) 8 | 9 | del os, Path 10 | -------------------------------------------------------------------------------- /llama2-chatbot/config/inference.yml: -------------------------------------------------------------------------------- 1 | # -------- Models -------- 2 | tiny: &tiny 3 | model: 4 | layers: 2 5 | hidden_size: 100 6 | sequence_length: 64 7 | attention: 8 | heads: 4 9 | rotary_dim: 4 10 | embedding: 11 | vocab_size: 150 12 | 13 | llama2_7b: &llama2_7b 14 | model: 15 | layers: 32 16 | hidden_size: 4096 17 | intermediate_size: 11008 18 | sequence_length: 2048 19 | attention: 20 | heads: 32 21 | embedding: 22 | vocab_size: 32000 23 | eps: 1.0e-6 24 | 25 | llama2_13b: &llama2_13b 26 | model: 27 | layers: 40 28 | hidden_size: 5120 29 | intermediate_size: 13824 30 | sequence_length: 2048 31 | attention: 32 | heads: 40 33 | embedding: 34 | vocab_size: 32000 35 | eps: 1.0e-5 36 | 37 | # ------------------------- 38 | 39 | # ------- Execution ------- 40 | release: 41 | tiny: 42 | <<: *tiny 43 | execution: 44 | micro_batch_size: 4 45 | available_memory_proportion: [ 0.4 ] 46 | tensor_parallel: 4 47 | 48 | llama2_7b_pod2: 49 | <<: *llama2_7b 50 | execution: 51 | micro_batch_size: 1 52 | available_memory_proportion: [ 0.1 ] 53 | tensor_parallel: 2 54 | 55 | llama2_7b_pod4: 56 | <<: *llama2_7b 57 | execution: 58 | micro_batch_size: 1 59 | available_memory_proportion: [ 0.4 ] 60 | tensor_parallel: 4 61 | 62 | llama2_7b_pod16: 63 | <<: *llama2_7b 64 | execution: 65 | micro_batch_size: 1 66 | available_memory_proportion: [ 0.4 ] 67 | tensor_parallel: 16 68 | 69 | llama2_13b_pod4: 70 | <<: *llama2_13b 71 | execution: 72 | micro_batch_size: 1 73 | available_memory_proportion: [ 0.4 ] 74 | tensor_parallel: 4 75 | 76 | # TODO: Add attention padding to support attention tensor-parallel up to 16 77 | llama2_13b_pod16: 78 | <<: *llama2_13b 79 | execution: 80 | micro_batch_size: 1 81 | available_memory_proportion: [ 0.4 ] 82 | tensor_parallel: 16 83 | attention_tensor_parallel: 4 84 | -------------------------------------------------------------------------------- /llama2-chatbot/modelling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | -------------------------------------------------------------------------------- /llama2-chatbot/modelling/decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
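# Tensor-parallel Llama decoder block: a pre-norm residual structure where
# x = x + attention(rms_norm_1(x)) followed by x = x + feed_forward(rms_norm_2(x)),
# with the attention and MLP layers sharded across replicas and the norms computed
# identically on every replica.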
2 | import numpy as np 3 | from typing import Dict 4 | 5 | import popxl 6 | from popxl import ops 7 | from popxl.utils import to_numpy 8 | 9 | import popxl_addons as addons 10 | from popxl_addons import NamedTensors 11 | 12 | from transformers.models.llama.modeling_llama import LlamaDecoderLayer as HFModel 13 | 14 | from config import LlamaConfig 15 | from .attention import LlamaSelfAttentionTP 16 | from .feed_forward import LlamaFeedForwardTP 17 | from .rms_norm import LlamaRMSNorm 18 | 19 | 20 | class LlamaDecoderBlockTP(addons.Module): 21 | def __init__(self, config: LlamaConfig): 22 | super().__init__() 23 | self.config = config 24 | # begins with identical computations: layer norm ln_1 25 | self.ln_1 = LlamaRMSNorm(self.config) 26 | # attention is sharded 27 | # identical computation for bias and skip connection 28 | self.attention = LlamaSelfAttentionTP(self.config) 29 | # begins with identical computations: layer norm ln_2 30 | self.ln_2 = LlamaRMSNorm(self.config) 31 | # feed forward is sharded 32 | # identical computation for bias, dropout and skip connection 33 | self.feed_forward = LlamaFeedForwardTP(self.config) 34 | 35 | def build(self, x: popxl.Tensor): 36 | initial_residual = x 37 | ax = self.ln_1(x) 38 | ax = self.attention(ax) 39 | ax = initial_residual + ax 40 | 41 | post_attn_residual = ax 42 | fx = self.ln_2(ax) 43 | fx = self.feed_forward(fx) 44 | 45 | hs = post_attn_residual + fx 46 | return hs 47 | 48 | 49 | @staticmethod 50 | def hf_mapping(config: LlamaConfig, variables: NamedTensors, hf_model: HFModel) -> Dict[popxl.Tensor, np.ndarray]: 51 | 52 | dtype = config.model.dtype 53 | weights = { 54 | variables.ln_1.weight: to_numpy(hf_model.input_layernorm.weight.data, dtype), 55 | variables.ln_2.weight: to_numpy(hf_model.post_attention_layernorm.weight.data, dtype), 56 | } 57 | weights.update(LlamaSelfAttentionTP.hf_mapping(config, variables.attention, hf_model.self_attn)) 58 | weights.update(LlamaFeedForwardTP.hf_mapping(config, variables.feed_forward, hf_model.mlp)) 59 | 60 | return weights 61 | 62 | 63 | class LlamaDecoderTP(addons.Module): 64 | def __init__(self, config: LlamaConfig): 65 | super().__init__() 66 | self.config = config 67 | 68 | def build(self, x: popxl.Tensor): 69 | 70 | facts, graph = LlamaDecoderBlockTP(self.config).create_graph(x) # Outline GPT Layer 71 | 72 | for i in range(self.config.model.layers): 73 | args_nt = self.add_variable_inputs(i, facts) 74 | (x,) = graph.bind(args_nt).call(x) 75 | 76 | return x 77 | -------------------------------------------------------------------------------- /llama2-chatbot/modelling/feed_forward.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
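# Tensor-parallel Llama MLP (SwiGLU): gate_proj and up_proj are sharded column-wise so
# the swish gating can be applied per shard, down_proj is sharded row-wise, and a final
# replicated all-reduce restores identical activations on every replica.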
2 | from typing import Optional, List, Dict 3 | import popxl 4 | from popxl import ops 5 | from popxl.utils import to_numpy 6 | from transformers.models.llama.modeling_llama import LlamaMLP as HFModel 7 | 8 | import popxl_addons as addons 9 | from config import LlamaConfig 10 | from popxl_addons.layers import Linear 11 | import numpy as np 12 | 13 | from popxl_addons.named_tensors import NamedTensors 14 | from popxl_addons.ops.replicated_all_reduce_TP import replicated_all_reduce 15 | from popxl_addons.array_munging import shard 16 | 17 | 18 | class LlamaFeedForwardTP(addons.Module): 19 | def __init__(self, config: LlamaConfig): 20 | super().__init__() 21 | self.config = config 22 | tp = config.execution.tensor_parallel 23 | dp = config.execution.data_parallel 24 | self.n_shards = tp 25 | self.replica_grouping = popxl.gcg().ir.replica_grouping(stride=tp, group_size=dp) 26 | self.intermediate_size = self.config.model.intermediate_size 27 | self.hidden_size = self.config.model.hidden_size 28 | 29 | # ----- Layers ----- 30 | # Sharded across devices - column wise 31 | self.gate_proj = Linear( 32 | self.intermediate_size // self.n_shards, bias=False, replica_grouping=self.replica_grouping) 33 | self.up_proj = Linear( 34 | self.intermediate_size // self.n_shards, bias=False, replica_grouping=self.replica_grouping) 35 | 36 | # Sharded across devices - row wise (no bias) 37 | self.down_proj = Linear( 38 | self.hidden_size, bias=False, replica_grouping=self.replica_grouping) 39 | 40 | def build(self, x: popxl.Tensor) -> List[popxl.Tensor]: 41 | """Identical input (x, seed) and identical output across shards.""" 42 | # ----- Sharded computation ----- 43 | 44 | # Shard column-wise since gelu is not linear. 45 | # Indeed, sharding row wise requires a sum AllReduce at the end, 46 | # but swish is not linear: swish(x+y) != swish(x) + swish(y) 47 | up = self.up_proj(x) 48 | 49 | gp = self.gate_proj(x) 50 | gp_act = ops.swish(gp) 51 | # Here, x is already sharded across devices. Since we don't have non linearities, 52 | # we can shard row-wise (which requires both X and the weight matrix to be sharded) 53 | # and then perform an all reduce 54 | z = gp_act * up 55 | 56 | z = self.down_proj(z) 57 | z = replicated_all_reduce(z, group=self.replica_grouping.transpose()) 58 | return z 59 | 60 | @staticmethod 61 | def hf_mapping(config: LlamaConfig, variables: NamedTensors, hf_model: HFModel) -> Dict[popxl.Tensor, np.ndarray]: 62 | dtype = config.model.dtype 63 | n_shards = config.execution.tensor_parallel 64 | 65 | return { 66 | variables.gate_proj.weight: shard( 67 | to_numpy(hf_model.gate_proj.weight.data.T, dtype), n_shards, axis=-1 68 | ), 69 | variables.up_proj.weight: shard( 70 | to_numpy(hf_model.up_proj.weight.data.T, dtype), n_shards, axis=-1 71 | ), 72 | variables.down_proj.weight: shard( 73 | to_numpy(hf_model.down_proj.weight.data.T, dtype), n_shards, axis=0 74 | ), 75 | } 76 | -------------------------------------------------------------------------------- /llama2-chatbot/modelling/hf_mapping.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
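# --- Illustrative sketch (not part of hf_mapping.py): why LlamaFeedForwardTP shards as it does ---
# gate_proj/up_proj above are split column-wise (the SiLU/swish non-linearity has to see
# whole columns), down_proj is split row-wise, and the per-shard partial results are summed
# by the final all-reduce. A numpy check of that identity (toy shapes, made up here):
#
#   import numpy as np
#   silu = lambda a: a / (1.0 + np.exp(-a))                      # a * sigmoid(a)
#   h, ff, tp = 8, 16, 4
#   rng = np.random.default_rng(0)
#   x, Wg, Wu, Wd = rng.random(h), rng.random((h, ff)), rng.random((h, ff)), rng.random((ff, h))
#   full = (silu(x @ Wg) * (x @ Wu)) @ Wd
#   slices = [slice(i * ff // tp, (i + 1) * ff // tp) for i in range(tp)]
#   parts = [(silu(x @ Wg[:, s]) * (x @ Wu[:, s])) @ Wd[s, :] for s in slices]
#   np.testing.assert_allclose(sum(parts), full)                 # the all-reduce sum recovers the result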
2 | 3 | from typing import Dict 4 | import numpy as np 5 | 6 | from transformers.models.llama import LlamaModel as HFModel 7 | from transformers.models.llama import LlamaForCausalLM as HFLMHeadModel 8 | 9 | import popxl 10 | from popxl_addons import TaskSession 11 | 12 | from config import LlamaConfig 13 | from modelling.llama_model import LlamaModelTP 14 | from modelling.llama_lm import LlamaLMHeadModelTP 15 | 16 | 17 | def hf_mapping_lm_tp( 18 | config: LlamaConfig, session: TaskSession, pretrained: HFLMHeadModel 19 | ) -> Dict[popxl.Tensor, np.ndarray]: 20 | load_to = session.state 21 | if "fwd" in session.state: 22 | load_to = session.state.fwd 23 | weights = LlamaLMHeadModelTP.hf_mapping(config, load_to, pretrained) 24 | return weights 25 | 26 | 27 | def hf_mapping_TP(config: LlamaConfig, session: TaskSession, pretrained: HFModel) -> Dict[popxl.Tensor, np.ndarray]: 28 | load_to = session.state 29 | if "fwd" in session.state: 30 | load_to = session.state.fwd 31 | weights = LlamaModelTP.hf_mapping(config, load_to, pretrained) 32 | return weights 33 | 34 | 35 | def load_lm_to_hf(session: TaskSession, hf_model: HFLMHeadModel) -> HFLMHeadModel: 36 | weights = session.get_named_tensors_data() 37 | if "fwd" in weights: 38 | weights = weights.fwd 39 | state_dict = LlamaLMHeadModelTP.to_hf(weights, hf_model) 40 | # check only missing keys are mask-related keys 41 | hf_state_keys = hf_model.state_dict().keys() 42 | popxl_keys = state_dict.keys() 43 | 44 | def should_check(k: str): 45 | return "attn.bias" not in k and "attn.masked_bias" not in k 46 | 47 | for k in hf_state_keys: 48 | if should_check(k) and k not in popxl_keys: 49 | raise KeyError(f"key {k} not found in session state") 50 | 51 | hf_model.load_state_dict(state_dict, strict=False) 52 | return hf_model 53 | 54 | 55 | def load_to_hf(session: TaskSession, hf_model: HFModel) -> HFModel: 56 | weights = session.get_named_tensors_data() 57 | if "fwd" in weights: 58 | weights = weights.fwd 59 | 60 | state_dict = LlamaModelTP.to_hf(weights, hf_model) 61 | # check only missing keys are mask-related keys 62 | hf_state_keys = hf_model.state_dict().keys() 63 | popxl_keys = state_dict.keys() 64 | 65 | def should_check(k: str): 66 | return "attn.bias" not in k and "attn.masked_bias" not in k 67 | 68 | for k in hf_state_keys: 69 | if should_check(k) and k not in popxl_keys: 70 | raise KeyError(f"key {k} not found in session state") 71 | 72 | hf_model.load_state_dict(state_dict, strict=False) 73 | return hf_model 74 | -------------------------------------------------------------------------------- /llama2-chatbot/modelling/llama_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
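# --- Illustrative sketch (not part of llama_model.py): using the hf_mapping helpers above ---
# hf_mapping_lm_tp turns a Hugging Face LlamaForCausalLM checkpoint into a
# {popxl.Tensor: np.ndarray} dict keyed by this model's variables, which is then written
# into a built TaskSession. Roughly (how config/session are built is elided here and
# follows the inference script):
#
#   from transformers import LlamaForCausalLM
#   hf_model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
#   weights = hf_mapping_lm_tp(config, session, hf_model)
#   session.write_variables_data(weights)
#
# load_lm_to_hf goes the other way: it reads the session's named tensors back into an HF
# state_dict and checks that only mask-related keys are missing.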
2 | import numpy as np 3 | from typing import Dict 4 | from config import LlamaConfig 5 | 6 | import popxl 7 | from popxl.utils import to_numpy 8 | 9 | import popxl_addons as addons 10 | from popxl_addons import NamedTensors 11 | 12 | # from popxl_addons.layers import LayerNorm 13 | from .rms_norm import LlamaRMSNorm 14 | 15 | from .embedding import LlamaEmbeddingsTP 16 | from .decoder import LlamaDecoderTP, LlamaDecoderBlockTP 17 | 18 | from transformers.models.llama.modeling_llama import LlamaModel as HFModel 19 | 20 | 21 | class LlamaModelTP(addons.Module): 22 | def __init__(self, config: LlamaConfig, include_layer_norm=True): 23 | super().__init__() 24 | self.config = config 25 | # sharded, then last bit identical 26 | self.embeddings = LlamaEmbeddingsTP(self.config) 27 | # identical inputs, then sharded, then identical 28 | self.decoder = LlamaDecoderTP(self.config) 29 | # identical 30 | self.include_layer_norm = include_layer_norm 31 | if self.include_layer_norm: 32 | self.ln_f = LlamaRMSNorm(self.config) 33 | 34 | def build(self, input_ids: popxl.Tensor): 35 | x = self.embeddings(input_ids) 36 | x = self.decoder(x) 37 | if self.include_layer_norm: 38 | x = self.ln_f(x) 39 | return x 40 | 41 | @staticmethod 42 | def hf_mapping( 43 | config: LlamaConfig, variables: NamedTensors, hf_model: HFModel, layer_norm=True 44 | ) -> Dict[popxl.Tensor, np.ndarray]: 45 | dtype = config.model.dtype 46 | 47 | weights = {} 48 | if layer_norm: 49 | weights = { 50 | variables.ln_f.weight: to_numpy(hf_model.norm.weight.data, dtype), 51 | } 52 | 53 | weights.update(LlamaEmbeddingsTP.hf_mapping(config, variables.embeddings, hf_model)) 54 | 55 | for l in range(config.model.layers): 56 | weights.update(LlamaDecoderBlockTP.hf_mapping(config, variables.decoder[l], hf_model.layers[l])) 57 | 58 | return weights 59 | -------------------------------------------------------------------------------- /llama2-chatbot/modelling/rms_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | from functools import partial 3 | from typing import Dict 4 | import torch 5 | import popxl 6 | from popxl import ops 7 | from popxl.utils import to_numpy 8 | from transformers.models.llama.modeling_llama import LlamaRMSNorm as HFModel 9 | 10 | import popxl_addons as addons 11 | from config import LlamaConfig 12 | from popxl_addons.named_tensors import NamedTensorData 13 | import numpy as np 14 | 15 | from popxl_addons.named_tensors import NamedTensors 16 | 17 | 18 | class LlamaRMSNorm(addons.Module): 19 | def __init__(self, config: LlamaConfig): 20 | super().__init__() 21 | self.eps = config.model.eps 22 | 23 | def build(self, x: popxl.Tensor) -> popxl.Tensor: 24 | """ 25 | Build RMS layer normalisation for Llama. No bias and no subtraction of mean. This is equivalent to T5LayerNorm. 
26 | """ 27 | w = self.add_variable_input("weight", partial(np.ones, x.shape[-1]), x.dtype) 28 | 29 | # Perform the computation in float32 30 | if x.dtype == popxl.float16: 31 | x = ops.cast(x, popxl.float32) 32 | 33 | variance = ops.mean(x * x, -1, keepdims=True) 34 | 35 | x = x / ops.sqrt(variance + self.eps) 36 | 37 | # Cast back down to float16 if needed 38 | if w.dtype == popxl.float16: 39 | x = ops.cast(x, popxl.float16) 40 | 41 | x = x * w 42 | return x 43 | 44 | @staticmethod 45 | def hf_mapping(config: LlamaConfig, variables: NamedTensors, hf_model: HFModel) -> Dict[popxl.Tensor, np.ndarray]: 46 | dtype = config.model.dtype 47 | weights = { 48 | variables.weight: to_numpy(hf_model.weight.data, dtype), 49 | } 50 | return weights -------------------------------------------------------------------------------- /llama2-chatbot/modelling/rotary_pos_embed/.rendered.rotary_pos_embed_binding.cpp: -------------------------------------------------------------------------------- 1 | // cppimport 2 | // NOTE: the cppimport comment is necessary for dynamic compilation when loading 3 | // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #include "common.hpp" 32 | #include "rotary_pos_embed.hpp" 33 | #include "rotary_pos_embedx.hpp" 34 | 35 | namespace py = pybind11; 36 | 37 | // -------------- PyBind -------------- 38 | // `rotary_pos_embed_binding` must equal filename 39 | PYBIND11_MODULE(rotary_pos_embed_binding, m) { 40 | // Bindings the parameters of the op: constructor + fields. 41 | py::class_> 43 | binding(m, "RotaryPosEmbedOp"); 44 | binding.def_static( 45 | "createOpInGraph", 46 | py::overload_cast( 48 | &popart::RotaryPosEmbedOp::createOpInGraph), 49 | py::arg("graph"), py::arg("inputs"), py::arg("outputs"), 50 | py::arg("rotaryDim"), py::arg("settings"), 51 | py::return_value_policy::reference); 52 | binding.def("outTensor", 53 | py::overload_cast(&popart::RotaryPosEmbedOp::outTensor), 54 | py::return_value_policy::reference); 55 | }; 56 | 57 | // -------------- cppimport -------------- 58 | // cppimport configuration for compiling the pybind11 module. 59 | // clang-format off 60 | /* 61 | 62 | */ 63 | -------------------------------------------------------------------------------- /llama2-chatbot/modelling/rotary_pos_embed/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | from .rotary_pos_embed import * 3 | -------------------------------------------------------------------------------- /llama2-chatbot/modelling/rotary_pos_embed/common.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 
2 | #ifndef GUARD_ROTARYPOSEMBED_OPIDS 3 | #define GUARD_ROTARYPOSEMBED_OPIDS 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using InMapType = std::map; 11 | using OutMapType = std::map; 12 | using OutIndex = int; 13 | 14 | namespace popart { 15 | 16 | #define CUSTOM_OP_DOMAIN "popxl.addons.ops" 17 | 18 | const popart::OperatorIdentifier RotaryPosEmbed = OperatorIdentifier{ 19 | CUSTOM_OP_DOMAIN, 20 | "RotaryPosEmbed", 21 | 1, // Op version 22 | {3, 3}, // number of inputs 23 | 1 // number of outputs 24 | }; 25 | 26 | const popart::OperatorIdentifier RotaryPosEmbedGrad = OperatorIdentifier{ 27 | CUSTOM_OP_DOMAIN, 28 | "RotaryPosEmbedGrad", 29 | 1, // Op version 30 | {3, 3}, // number of inputs 31 | 1 // number of outputs 32 | }; 33 | 34 | } // namespace popart 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /llama2-chatbot/modelling/rotary_pos_embed/rotary_pos_embed.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include "common.hpp" 19 | #include "rotary_pos_embed.hpp" 20 | 21 | namespace popart { 22 | 23 | ///////////////////////////////////////////////////////////// 24 | ////// Fwd op 25 | 26 | RotaryPosEmbedOp::RotaryPosEmbedOp(const OperatorIdentifier &_opid, 27 | uint32_t rotary_dim_, 28 | const Op::Settings &settings_) 29 | : Op(_opid, settings_), rotary_dim{rotary_dim_} { 30 | if ((rotary_dim % 2) != 0) { 31 | throw error("RotaryPosEmbedOp::RotaryPosEmbedOp rotary_dim must be a " 32 | "multiple of 2"); 33 | } 34 | } 35 | 36 | std::unique_ptr RotaryPosEmbedOp::clone() const { 37 | return std::make_unique(*this); 38 | } 39 | 40 | std::vector> RotaryPosEmbedOp::getGradOps() { 41 | std::vector> result; 42 | result.push_back(std::make_unique(*this)); 43 | return result; 44 | } 45 | 46 | void RotaryPosEmbedOp::setup() { 47 | auto xInfo = inInfo(0); 48 | auto cosInfo = inInfo(1); 49 | auto sinInfo = inInfo(2); 50 | 51 | // check expected shapes 52 | if (xInfo.rank() != 4) { 53 | throw error( 54 | "RotaryPosEmbedOp::setup x should have rank 4 (batch, heads, seq, hh)"); 55 | } 56 | if (cosInfo.rank() != 3 || sinInfo.rank() != 3) { 57 | throw error("RotaryPosEmbedOp::setup trig functions should have rank 3 " 58 | "(1 or batch, seq, hh/2)"); 59 | } 60 | if ((rotary_dim % 2) != 0) { 61 | throw error("RotaryPosEmbedOp::setup rotary dim must be a multiple of 2"); 62 | } 63 | 64 | // x rotated 65 | outInfo(0) = xInfo; 66 | } 67 | 68 | void RotaryPosEmbedOp::appendOutlineAttributes(OpSerialiserBase &os) const { 69 | os.appendAttribute("rotary_dim", rotary_dim); 70 | Op::appendOutlineAttributes(os); 71 | } 72 | 73 | ///////////////////////////////////////////////////////////// 74 | ////// Grad op 75 | 76 | RotaryPosEmbedGradOp::RotaryPosEmbedGradOp(const RotaryPosEmbedOp &op) 77 | : Op(RotaryPosEmbedGrad, op.getSettings()), rotary_dim{op.rotary_dim} {} 78 | 79 | const std::map &RotaryPosEmbedGradOp::gradOutToNonGradIn() const { 80 | static const std::map outInfo = {{0, 0}}; 81 | return outInfo; 82 | } 83 | 84 | const std::vector & 85 | RotaryPosEmbedGradOp::gradInputInfo() const { 86 | static const std::vector inInfo = { 87 | {0, 0, GradOpInType::GradOut}, 88 | {1, 1, GradOpInType::In}, 89 | {2, 2, GradOpInType::In}}; 90 | 
return inInfo; 91 | } 92 | 93 | void RotaryPosEmbedGradOp::setup() { outInfo(0) = inInfo(0); } 94 | 95 | std::unique_ptr RotaryPosEmbedGradOp::clone() const { 96 | return std::make_unique(*this); 97 | } 98 | 99 | void RotaryPosEmbedGradOp::appendOutlineAttributes(OpSerialiserBase &os) const { 100 | os.appendAttribute("rotary_dim", rotary_dim); 101 | Op::appendOutlineAttributes(os); 102 | } 103 | 104 | } // namespace popart 105 | -------------------------------------------------------------------------------- /llama2-chatbot/modelling/rotary_pos_embed/rotary_pos_embed.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | #ifndef GUARD_NEURALNET_STRIDEDSLICE_HPP 3 | #define GUARD_NEURALNET_STRIDEDSLICE_HPP 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "common.hpp" 10 | 11 | namespace popart { 12 | 13 | class RotaryPosEmbedOp : public Op { 14 | public: 15 | RotaryPosEmbedOp(const OperatorIdentifier &_opid, uint32_t rotary_dim_, 16 | const Op::Settings &settings_); 17 | 18 | std::unique_ptr clone() const override; 19 | std::vector> getGradOps() override; 20 | void setup() final; 21 | 22 | float getSubgraphValue() const override { return getHighSubgraphValue(); } 23 | 24 | static RotaryPosEmbedOp * 25 | createOpInGraph(popart::Graph &graph, const InMapType &in, 26 | const OutMapType &out, uint32_t rotary_dim_, 27 | const popart::Op::Settings &settings) { 28 | return graph.createConnectedOp(in, out, RotaryPosEmbed, 29 | rotary_dim_, settings); 30 | } 31 | 32 | void appendOutlineAttributes(OpSerialiserBase &) const override; 33 | 34 | uint32_t rotary_dim = 0; 35 | }; 36 | 37 | class RotaryPosEmbedGradOp : public Op { 38 | public: 39 | RotaryPosEmbedGradOp(const RotaryPosEmbedOp &op); 40 | 41 | void setup() final; 42 | std::unique_ptr clone() const override; 43 | const std::vector &gradInputInfo() const final; 44 | const std::map &gradOutToNonGradIn() const final; 45 | 46 | float getSubgraphValue() const override { return getHighSubgraphValue(); } 47 | 48 | void appendOutlineAttributes(OpSerialiserBase &) const override; 49 | 50 | uint32_t rotary_dim = 0; 51 | }; 52 | 53 | } // namespace popart 54 | 55 | #endif 56 | -------------------------------------------------------------------------------- /llama2-chatbot/modelling/rotary_pos_embed/rotary_pos_embed_binding.cpp: -------------------------------------------------------------------------------- 1 | // cppimport 2 | // NOTE: the cppimport comment is necessary for dynamic compilation when loading 3 | // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #include "common.hpp" 32 | #include "rotary_pos_embed.hpp" 33 | #include "rotary_pos_embedx.hpp" 34 | 35 | namespace py = pybind11; 36 | 37 | // -------------- PyBind -------------- 38 | // `rotary_pos_embed_binding` must equal filename 39 | PYBIND11_MODULE(rotary_pos_embed_binding, m) { 40 | // Bindings the parameters of the op: constructor + fields. 
41 | py::class_> 43 | binding(m, "RotaryPosEmbedOp"); 44 | binding.def_static( 45 | "createOpInGraph", 46 | py::overload_cast( 48 | &popart::RotaryPosEmbedOp::createOpInGraph), 49 | py::arg("graph"), py::arg("inputs"), py::arg("outputs"), 50 | py::arg("rotaryDim"), py::arg("settings"), 51 | py::return_value_policy::reference); 52 | binding.def("outTensor", 53 | py::overload_cast(&popart::RotaryPosEmbedOp::outTensor), 54 | py::return_value_policy::reference); 55 | }; 56 | 57 | // -------------- cppimport -------------- 58 | // cppimport configuration for compiling the pybind11 module. 59 | // clang-format off 60 | /* 61 | <% 62 | cfg['sources'] = ['rotary_pos_embed.cpp', 'rotary_pos_embedx.cpp'] 63 | cfg['extra_compile_args'] = ['-std=c++14', '-fPIC', '-O2', '-DONNX_NAMESPACE=onnx', '-Wall', '-Wno-sign-compare'] 64 | cfg['libraries'] = ['popart', 'poputil', 'popops', 'poplin', 'popnn', 'poprand', 'gcl'] 65 | setup_pybind11(cfg) 66 | %> 67 | */ 68 | -------------------------------------------------------------------------------- /llama2-chatbot/modelling/rotary_pos_embed/rotary_pos_embed_binding.cpython-38-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/llama2-chatbot/modelling/rotary_pos_embed/rotary_pos_embed_binding.cpython-38-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /llama2-chatbot/modelling/rotary_pos_embed/rotary_pos_embedx.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022 Graphcore Ltd. All rights reserved. 2 | #ifndef GUARD_NEURALNET_ROTARYPOSEMBEDX_HPP 3 | #define GUARD_NEURALNET_ROTARYPOSEMBEDX_HPP 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | namespace popart { 10 | namespace popx { 11 | 12 | class RotaryPosEmbedOpx : public Opx { 13 | public: 14 | RotaryPosEmbedOpx(Op *, Devicex *); 15 | 16 | void grow(poplar::program::Sequence &) const; 17 | }; 18 | 19 | class RotaryPosEmbedGradOpx : public Opx { 20 | public: 21 | RotaryPosEmbedGradOpx(Op *, Devicex *); 22 | 23 | void grow(poplar::program::Sequence &) const; 24 | }; 25 | 26 | } // namespace popx 27 | } // namespace popart 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /llama2-chatbot/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = 3 | tests 4 | 5 | addopts = 6 | -r a 7 | -v 8 | 9 | python_paths = . 
../../../utils/ 10 | -------------------------------------------------------------------------------- /llama2-chatbot/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.24.2 2 | scipy==1.10.1 3 | 4 | transformers==4.31.0 5 | huggingface-hub==0.16.4 6 | accelerate==0.20.3 7 | sentencepiece==0.1.99 8 | 9 | pytest==6.2.5 10 | pytest-pythonpath==0.7.4 11 | 12 | git+https://github.com/graphcore/popxl-addons.git@sdk-release-3.3.0 13 | 14 | graphcore-cloud-tools[logger] @ git+https://github.com/graphcore/graphcore-cloud-tools@v0.3 15 | examples-utils[common] @ git+https://github.com/graphcore/examples-utils.git@v3.3 16 | 17 | -f https://download.pytorch.org/whl/torch_stable.html 18 | torch==2.0.1+cpu -------------------------------------------------------------------------------- /llama2-chatbot/run-inference.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 3 | import logging 4 | from typing import Optional, Tuple 5 | 6 | from transformers import AutoTokenizer 7 | from utils.setup import llama_config_setup 8 | from config import LlamaConfig 9 | from api import LlamaPipeline 10 | 11 | def run_inference_popxl(config: LlamaConfig, tokenizer, hf_model, sequence_length: Optional[int] = None): 12 | if sequence_length is not None: 13 | config.model.sequence_length = sequence_length 14 | 15 | pipe = LlamaPipeline(config, hf_llama_checkpoint=hf_model, tokenizer=tokenizer) 16 | 17 | def get_input() -> Tuple[str, float, int, int]: 18 | while True: 19 | try: 20 | logging.info("-- Enter prompt --") 21 | prompt = input("> ") 22 | logging.info("-- Enter Sampling Temperature (0 for greedy) --") 23 | temperature = float(input("> ")) 24 | logging.info("-- Enter top-k parameter (0 for max) --") 25 | k = int(input("> ")) 26 | logging.info("-- Enter number of tokens to generate --") 27 | num_tokens = int(input("> ")) 28 | break 29 | except ValueError: 30 | logging.info("Invalid input!") 31 | 32 | return prompt, temperature, k, num_tokens 33 | 34 | while True: 35 | prompt, temperature, k, output_length = get_input() 36 | pipe(prompt, k=k, temperature=temperature, output_length=output_length)[0] 37 | 38 | 39 | def main(): 40 | # --- Setup --- 41 | config, _, hf_model = llama_config_setup("config/inference.yml", "release", "llama2_7b_pod4", hf_model_setup=True) 42 | tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") 43 | 44 | run_inference_popxl(config, tokenizer, hf_model=hf_model, sequence_length=2048) 45 | 46 | 47 | if __name__ == "__main__": 48 | try: 49 | main() 50 | except Exception as e: 51 | logging.exception(e) # Log time of exception 52 | raise 53 | -------------------------------------------------------------------------------- /llama2-chatbot/tests/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
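# --- Illustrative sketch (not part of conftest.py): driving run-inference.py above ---
# The script builds a LlamaPipeline from the "llama2_7b_pod4" entry of config/inference.yml
# and loops over interactive prompts. Non-interactively, the same call looks roughly like
# (names as defined in run-inference.py):
#
#   pipe = LlamaPipeline(config, hf_llama_checkpoint=hf_model, tokenizer=tokenizer)
#   answer = pipe("What is a Mandelbrot set?", k=5, temperature=0.7, output_length=128)[0]
#
# temperature 0 is greedy decoding and k 0 is "max", per the prompts printed by get_input().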
2 | import os 3 | from config import LlamaConfig 4 | 5 | import pytest 6 | 7 | from utils.simple_parsing_tools import parse_args_with_config_file 8 | 9 | 10 | def _test_config_file(): 11 | return os.path.join(os.path.dirname(__file__), "test_config.yml") 12 | 13 | 14 | @pytest.fixture 15 | def test_config_file(): 16 | return _test_config_file() 17 | 18 | 19 | @pytest.fixture 20 | def test_config(): 21 | return parse_args_with_config_file(LlamaConfig, ["--config", _test_config_file()]) 22 | 23 | 24 | # Below functions enable long tests to be skipped, unless a --long-test 25 | # cli option is specified. 26 | def pytest_addoption(parser): 27 | parser.addoption("--long-tests", action="store_true", default=False, help="Run long tests") 28 | -------------------------------------------------------------------------------- /llama2-chatbot/tests/integration/execution/test_execution.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | import os 3 | import sys 4 | from pathlib import Path 5 | 6 | from examples_tests.test_util import SubProcessChecker 7 | 8 | root_dir = Path(__file__).parent.parent.parent.parent.resolve() 9 | 10 | 11 | def dolly_root_env_path(): 12 | env = os.environ 13 | env["PYTHONPATH"] = ":".join((*sys.path, str(root_dir))) 14 | return env 15 | 16 | 17 | class TestExecution(SubProcessChecker): 18 | def test_inference(self): 19 | self.run_command( 20 | "python3 inference.py --config tiny --layers 2 " 21 | "--tensor_parallel 4 " 22 | "--vocab_size 128 --sequence_length 16 " 23 | "--hidden_size 128 --heads 8", 24 | root_dir, 25 | ["Duration"], 26 | env=dolly_root_env_path(), 27 | ) 28 | -------------------------------------------------------------------------------- /llama2-chatbot/tests/integration/layers/test_attention_TP.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
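# --- Illustrative sketch (not part of test_attention_TP.py): how these tests get their LlamaConfig ---
# conftest.py builds the `test_config` fixture used below by parsing tests/test_config.yml
# through the shared simple-parsing helpers:
#
#   from utils.simple_parsing_tools import parse_args_with_config_file
#   cfg = parse_args_with_config_file(LlamaConfig, ["--config", "tests/test_config.yml"])
#   # cfg.model.hidden_size == 128, cfg.execution.tensor_parallel == 4 (see test_config.yml)
#
# so every layer test runs against a 2-layer, 4-head, tensor-parallel-4 toy model rather
# than a full Llama 2 checkpoint.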
2 | import numpy as np 3 | import torch 4 | 5 | # HF 6 | from transformers.models.llama import LlamaConfig as HFConfig 7 | from transformers.models.llama.modeling_llama import LlamaAttention 8 | 9 | import popxl 10 | 11 | import popxl_addons as addons 12 | from popxl_addons.patterns import apply_pre_alias_patterns 13 | from config import LlamaConfig 14 | from modelling.attention import LlamaSelfAttentionTP 15 | from popxl_addons.array_munging import repeat 16 | 17 | 18 | def test_attention_TP_cmp_huggingface(test_config: LlamaConfig): 19 | torch.manual_seed(42) 20 | 21 | batch_size = test_config.execution.micro_batch_size 22 | seq_len = test_config.model.sequence_length 23 | hidden_size = test_config.model.hidden_size 24 | intermediate_size = hidden_size * 4 25 | 26 | # HuggingFace 27 | config = HFConfig( 28 | hidden_size=hidden_size, 29 | max_position_embeddings=seq_len, 30 | intermediate_size=intermediate_size, 31 | num_attention_heads=test_config.model.attention.heads, 32 | rotary_dim=test_config.model.attention.rotary_dim, 33 | ) 34 | hf_model = LlamaAttention(config).eval() 35 | 36 | # HF forward 37 | input_t = torch.rand((batch_size, seq_len, hidden_size), requires_grad=True) 38 | output_, *_ = hf_model(input_t, None) 39 | output_HF = output_.detach().numpy() 40 | 41 | # TP 42 | n_shards = test_config.execution.tensor_parallel 43 | 44 | # popxl 45 | ir = popxl.Ir() 46 | ir.replication_factor = n_shards 47 | with ir.main_graph: 48 | inputs_data, inputs_host_steam, inputs_tensors = zip( 49 | *[ 50 | addons.host_load( 51 | input_t.reshape(-1, test_config.model.hidden_size), test_config.model.dtype, name="input" 52 | ), 53 | ] 54 | ) 55 | (x,) = inputs_tensors 56 | 57 | attn_args, attn_graph = LlamaSelfAttentionTP(test_config).create_graph(x) 58 | 59 | vars = attn_args.init() 60 | fwd_info = attn_graph.bind(vars).call_with_info(x) 61 | (acts,) = fwd_info.outputs 62 | 63 | fwd_d2h = addons.host_store(acts) 64 | 65 | # Run `OpToIdentityPattern` among others part of `PreAliasPatterns` 66 | apply_pre_alias_patterns(ir, level="default") 67 | 68 | weights = LlamaSelfAttentionTP.hf_mapping(test_config, vars, hf_model) 69 | 70 | inputs = {h2d: repeat(data, n_shards).squeeze() for h2d, data in zip(inputs_host_steam, inputs_data)} 71 | 72 | with popxl.Session(ir, "ipu_hw") as session: 73 | session.write_variables_data(weights) 74 | outputs_popxl = session.run(inputs) 75 | 76 | fwd_data = outputs_popxl[fwd_d2h] 77 | 78 | if n_shards > 1: 79 | assert len(fwd_data) == n_shards 80 | 81 | # Assert all IPU outputs are identical 82 | for i in range(1, n_shards): 83 | np.testing.assert_equal(fwd_data[0], fwd_data[i]) 84 | else: 85 | fwd_data = np.expand_dims(fwd_data, axis=0) 86 | 87 | # Assert nearly equal to HF 88 | np.testing.assert_almost_equal(output_HF, fwd_data[0].reshape(output_HF.shape), 4) 89 | -------------------------------------------------------------------------------- /llama2-chatbot/tests/integration/layers/test_decoder_block_TP.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
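# --- Illustrative sketch (not part of test_decoder_block_TP.py): the repeat/compare pattern used below ---
# Each tensor-parallel test feeds every replica an identical copy of the host input using
# popxl_addons.array_munging.repeat (assumed here to stack the array n_shards times along a
# new leading axis), then asserts all replicas produce identical outputs before comparing
# replica 0 against Hugging Face:
#
#   inputs = {h2d: repeat(data, n_shards) for h2d, data in zip(host_streams, host_data)}
#   ...
#   for i in range(1, n_shards):
#       np.testing.assert_equal(fwd_data[0], fwd_data[i])
#   np.testing.assert_almost_equal(output_HF, fwd_data[0].reshape(output_HF.shape), 3)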
2 | import numpy as np 3 | import torch 4 | 5 | # HF 6 | from transformers.models.llama.configuration_llama import LlamaConfig as HFConfig 7 | from transformers.models.llama.modeling_llama import LlamaDecoderLayer 8 | 9 | import popxl 10 | 11 | import popxl_addons as addons 12 | from popxl_addons.patterns import apply_pre_alias_patterns 13 | 14 | from config import LlamaConfig 15 | from modelling.decoder import LlamaDecoderBlockTP 16 | from popxl_addons.array_munging import repeat 17 | 18 | 19 | def test_decoder_block_TP_cmp_huggingface(test_config: LlamaConfig): 20 | torch.manual_seed(42) 21 | 22 | batch_size = test_config.execution.micro_batch_size 23 | seq_len = test_config.model.sequence_length 24 | hidden_size = test_config.model.hidden_size 25 | intermediate_size = hidden_size * 4 26 | 27 | # HuggingFace 28 | config = HFConfig( 29 | hidden_size=hidden_size, 30 | max_position_embeddings=seq_len, 31 | intermediate_size=intermediate_size, 32 | num_attention_heads=test_config.model.attention.heads, 33 | rotary_dim=test_config.model.attention.rotary_dim, 34 | use_parallel_residual=True, 35 | ) 36 | hf_model = LlamaDecoderLayer(config).eval() 37 | 38 | # HF forward 39 | input_t = torch.rand((batch_size, seq_len, hidden_size), requires_grad=True) 40 | (output_,) = hf_model(input_t) 41 | 42 | output_HF = output_.detach().numpy() 43 | 44 | # TP 45 | n_shards = test_config.execution.tensor_parallel 46 | test_config.execution.tensor_parallel = n_shards 47 | 48 | # popxl 49 | ir = popxl.Ir() 50 | ir.replication_factor = n_shards 51 | 52 | replica_grouping = ir.replica_grouping(stride=1, group_size=1) 53 | 54 | main = ir.main_graph 55 | 56 | with main: 57 | inputs_data, inputs_host_steam, inputs_tensors = zip( 58 | *[ 59 | addons.host_load(input_t.reshape(-1, test_config.model.hidden_size), popxl.float32, name="input"), 60 | ] 61 | ) 62 | (x,) = inputs_tensors 63 | 64 | args, graph = LlamaDecoderBlockTP(test_config).create_graph(x) 65 | 66 | ff_vars = args.init() 67 | ff = graph.bind(ff_vars) 68 | fwd_info = ff.call_with_info(x) 69 | (acts,) = fwd_info.outputs 70 | 71 | fwd_d2h = addons.host_store(acts) 72 | 73 | # Run `OpToIdentityPattern` among others part of `PreAliasPatterns` 74 | apply_pre_alias_patterns(ir, level="default") 75 | 76 | weights = LlamaDecoderBlockTP.hf_mapping(test_config, ff_vars, hf_model) 77 | 78 | inputs = {h2d: repeat(data, n_shards) for h2d, data in zip(inputs_host_steam, inputs_data)} 79 | 80 | with popxl.Session(ir, "ipu_hw") as session: 81 | session.write_variables_data(weights) 82 | outputs_popxl = session.run(inputs) 83 | 84 | fwd_data = outputs_popxl[fwd_d2h] 85 | 86 | assert len(fwd_data) == n_shards 87 | 88 | # Assert all IPU outputs are identical 89 | for i in range(1, n_shards): 90 | np.testing.assert_equal(fwd_data[0], fwd_data[i]) 91 | # Assert nearly equal to HF 92 | np.testing.assert_almost_equal(output_HF, fwd_data[0].reshape(output_HF.shape), 3) 93 | -------------------------------------------------------------------------------- /llama2-chatbot/tests/integration/layers/test_feed_forward_TP.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
2 | import numpy as np 3 | import torch 4 | 5 | # HF 6 | from transformers.models.llama.configuration_llama import LlamaConfig as HFConfig 7 | from transformers.models.llama.modeling_llama import LlamaMLP 8 | 9 | import popxl 10 | 11 | import popxl_addons as addons 12 | from popxl_addons.patterns import apply_pre_alias_patterns 13 | 14 | from config import LlamaConfig 15 | from modelling.feed_forward import LlamaFeedForwardTP 16 | from popxl_addons.array_munging import repeat 17 | 18 | 19 | def test_feed_forward_TP_cmp_huggingface(test_config: LlamaConfig): 20 | torch.manual_seed(42) 21 | 22 | batch_size = test_config.execution.micro_batch_size 23 | seq_len = test_config.model.sequence_length 24 | hidden_size = test_config.model.hidden_size 25 | intermediate_size = hidden_size * 4 26 | 27 | # HuggingFace 28 | config = HFConfig( 29 | hidden_size=hidden_size, 30 | max_position_embeddings=seq_len, 31 | intermediate_size=intermediate_size, 32 | num_attention_heads=test_config.model.attention.heads, 33 | ) 34 | hf_model = LlamaMLP(config).eval() 35 | 36 | # HF forward 37 | input_t = torch.rand((batch_size, seq_len, hidden_size)) 38 | outputs = hf_model(input_t) 39 | output_ = outputs.reshape(batch_size * seq_len, hidden_size) 40 | output_HF = output_.detach().numpy() 41 | 42 | # TP 43 | n_shards = test_config.execution.tensor_parallel 44 | 45 | # popxl 46 | ir = popxl.Ir() 47 | ir.replication_factor = n_shards 48 | 49 | main = ir.main_graph 50 | 51 | with main: 52 | inputs_data, inputs_host_steam, inputs_tensors = zip( 53 | *[ 54 | addons.host_load(input_t.reshape(-1, test_config.model.hidden_size), popxl.float32, name="input"), 55 | ] 56 | ) 57 | (x,) = inputs_tensors 58 | 59 | ff_args, ff_graph = LlamaFeedForwardTP(test_config).create_graph(x) 60 | 61 | ff_vars = ff_args.init() 62 | ff = ff_graph.bind(ff_vars) 63 | fwd_info = ff.call_with_info(x) 64 | (acts,) = fwd_info.outputs 65 | 66 | fwd_d2h = addons.host_store(acts) 67 | 68 | # Run `OpToIdentityPattern` among others part of `PreAliasPatterns` 69 | apply_pre_alias_patterns(ir, level="default") 70 | 71 | weights = LlamaFeedForwardTP.hf_mapping(test_config, ff_vars, hf_model) 72 | 73 | inputs = {h2d: repeat(data, n_shards) for h2d, data in zip(inputs_host_steam, inputs_data)} 74 | with popxl.Session(ir, "ipu_hw") as session: 75 | session.write_variables_data(weights) 76 | outputs_popxl = session.run(inputs) 77 | 78 | fwd_data = outputs_popxl[fwd_d2h] 79 | 80 | assert len(fwd_data) == n_shards 81 | 82 | # Assert all IPU outputs are identical 83 | for i in range(1, n_shards): 84 | np.testing.assert_equal(fwd_data[0], fwd_data[i]) 85 | # Assert nearly equal to HF 86 | np.testing.assert_almost_equal(output_HF, fwd_data[0].reshape(output_HF.shape), 3) 87 | -------------------------------------------------------------------------------- /llama2-chatbot/tests/integration/layers/test_lm_TP.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
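# --- Illustrative sketch (not part of test_lm_TP.py): reassembling vocab-sharded logits ---
# In the LM-head test below each replica emits its slice of the logits along the vocabulary
# axis, so the host-side check concatenates the shards and trims back to the real vocab size
# before comparing with Hugging Face:
#
#   fwd_data_full = np.concatenate(fwd_data, axis=-1)[:, :vocab_size]
#   np.testing.assert_almost_equal(output_HF, fwd_data_full.reshape(output_HF.shape), 3)
#
# The trim matters when the embedding/LM head is padded so the vocabulary divides evenly
# across tensor-parallel shards (an assumption about modelling/embedding.py, which is not
# shown here).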
2 | import numpy as np 3 | import torch 4 | 5 | # HF 6 | from transformers.models.llama import LlamaConfig as HFConfig 7 | from transformers.models.llama import LlamaForCausalLM 8 | 9 | import popxl 10 | from popxl.utils import to_numpy 11 | 12 | import popxl_addons as addons 13 | from popxl_addons.patterns import apply_pre_alias_patterns 14 | 15 | from config import LlamaConfig 16 | from modelling.embedding import LlamaEmbeddingsTP 17 | from modelling.llama_lm import LlamaLMHeadModelTP 18 | 19 | from popxl_addons.array_munging import shard 20 | 21 | 22 | def test_lm_TP_cmp_huggingface(test_config: LlamaConfig): 23 | torch.manual_seed(42) 24 | batch_size = test_config.execution.micro_batch_size 25 | hidden_size = test_config.model.hidden_size 26 | intermediate_size = hidden_size * 4 27 | seq_len = test_config.model.sequence_length 28 | # HuggingFace 29 | config = HFConfig( 30 | num_hidden_layers=test_config.model.layers, 31 | vocab_size=test_config.model.embedding.vocab_size, 32 | hidden_size=hidden_size, 33 | max_position_embeddings=seq_len, 34 | intermediate_size=intermediate_size, 35 | num_attention_heads=test_config.model.attention.heads, 36 | rotary_dim=test_config.model.attention.rotary_dim, 37 | ) 38 | hf_model = LlamaForCausalLM(config).eval() 39 | 40 | # HF forward 41 | input_t = torch.randint(0, test_config.model.embedding.vocab_size, (batch_size, test_config.model.sequence_length)) 42 | output_HF = hf_model(input_ids=input_t)[0] 43 | output_HF = output_HF.detach().numpy() 44 | 45 | # n_shards 46 | n_shards = test_config.execution.tensor_parallel 47 | 48 | # Offset inputs 49 | words_offsetted = LlamaEmbeddingsTP.offset_inputs(test_config, to_numpy(input_t)) 50 | # popxl 51 | ir = popxl.Ir() 52 | ir.replication_factor = n_shards 53 | replica_grouping = ir.replica_grouping(stride=1, group_size=1) 54 | main = ir.main_graph 55 | 56 | with main: 57 | inputs_data, inputs_host_steam, inputs_tensors = zip( 58 | *[ 59 | addons.host_load(words_offsetted[0], popxl.int32, name="words"), 60 | ] 61 | ) 62 | (words,) = inputs_tensors 63 | facts, graph = LlamaLMHeadModelTP(test_config).create_graph(words) 64 | vars = facts.init() 65 | llm = graph.bind(vars) 66 | call_info = llm.call_with_info(words) 67 | act, *_ = call_info.outputs 68 | act_stream = addons.host_store(act) 69 | 70 | apply_pre_alias_patterns(ir, level="default") 71 | 72 | # Map weights from huggingface 73 | weights = LlamaLMHeadModelTP.hf_mapping(test_config, vars, hf_model) 74 | 75 | inputs = dict(zip(inputs_host_steam, [words_offsetted])) 76 | 77 | ir.num_host_transfers = test_config.execution.device_iterations 78 | 79 | with popxl.Session(ir, "ipu_hw") as session: 80 | session.write_variables_data(weights) 81 | outs = session.run(inputs) 82 | 83 | # Fwd output 84 | fwd_data = outs[act_stream] 85 | assert len(fwd_data) == n_shards 86 | fwd_data_full = np.concatenate(fwd_data, axis=-1)[:, : test_config.model.embedding.vocab_size] 87 | np.testing.assert_almost_equal(output_HF, fwd_data_full.reshape(output_HF.shape), 3) 88 | -------------------------------------------------------------------------------- /llama2-chatbot/tests/integration/layers/test_model_TP.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
2 | import numpy as np 3 | import torch 4 | 5 | # HF 6 | 7 | from transformers.models.llama.configuration_llama import LlamaConfig as HFConfig 8 | from transformers.models.llama.modeling_llama import LlamaModel 9 | 10 | import popxl 11 | from popxl.utils import to_numpy 12 | 13 | import popxl_addons as addons 14 | from popxl_addons.patterns import apply_pre_alias_patterns 15 | 16 | from config import LlamaConfig 17 | from modelling.embedding import LlamaEmbeddingsTP 18 | from modelling.llama_model import LlamaModelTP 19 | 20 | 21 | def test_model_TP_cmp_huggingface(test_config: LlamaConfig): 22 | torch.manual_seed(42) 23 | 24 | batch_size = test_config.execution.micro_batch_size 25 | hidden_size = test_config.model.hidden_size 26 | seq_len = test_config.model.sequence_length 27 | intermediate_size = hidden_size * 4 28 | # HuggingFace 29 | config = HFConfig( 30 | num_hidden_layers=test_config.model.layers, 31 | vocab_size=test_config.model.embedding.vocab_size, 32 | hidden_size=hidden_size, 33 | max_position_embeddings=seq_len, 34 | intermediate_size=intermediate_size, 35 | num_attention_heads=test_config.model.attention.heads, 36 | rotary_dim=test_config.model.attention.rotary_dim, 37 | ) 38 | hf_model = LlamaModel(config).eval() 39 | 40 | # HF forward 41 | input_t = torch.randint(0, test_config.model.embedding.vocab_size, (batch_size, test_config.model.sequence_length)) 42 | output_HF = hf_model(input_ids=input_t)[0] 43 | output_HF = output_HF.detach().numpy() 44 | 45 | # TP 46 | tp = test_config.execution.tensor_parallel 47 | 48 | # Offset inputs 49 | words_offsetted = LlamaEmbeddingsTP.offset_inputs(test_config, to_numpy(input_t)) 50 | 51 | # popxl 52 | ir = popxl.Ir() 53 | ir.replication_factor = tp 54 | replica_grouping = ir.replica_grouping(stride=1, group_size=1) 55 | main = ir.main_graph 56 | 57 | with main: 58 | inputs_data, inputs_host_steam, inputs_tensors = zip( 59 | *[ 60 | addons.host_load(words_offsetted[0], popxl.int32, name="words"), 61 | ] 62 | ) 63 | (words,) = inputs_tensors 64 | facts, graph = LlamaModelTP(test_config).create_graph(words) 65 | 66 | vars = facts.init() 67 | llm = graph.bind(vars) 68 | call_info = llm.call_with_info(words) 69 | act, *_ = call_info.outputs 70 | act_stream = addons.host_store(act) 71 | 72 | apply_pre_alias_patterns(ir, level="default") 73 | 74 | # Map weights from huggingface 75 | weights = LlamaModelTP.hf_mapping(test_config, vars, hf_model) 76 | 77 | inputs = dict(zip(inputs_host_steam, [words_offsetted])) 78 | 79 | ir.num_host_transfers = test_config.execution.device_iterations 80 | 81 | with popxl.Session(ir, "ipu_hw") as session: 82 | session.write_variables_data(weights) 83 | outs = session.run(inputs) 84 | 85 | # Fwd output 86 | fwd_data = outs[act_stream] 87 | 88 | assert len(fwd_data) == tp 89 | for i in range(1, tp): 90 | np.testing.assert_equal(fwd_data[0], fwd_data[i]) 91 | 92 | np.testing.assert_almost_equal(output_HF, fwd_data[0].reshape(output_HF.shape), 3) 93 | -------------------------------------------------------------------------------- /llama2-chatbot/tests/test_config.yml: -------------------------------------------------------------------------------- 1 | model: 2 | sequence_length: 256 # 8 3 | embedding: 4 | vocab_size: 128 5 | hidden_size: 128 6 | layers: 2 7 | attention: 8 | heads: 4 9 | rotary_dim: 8 10 | precision: "float32" 11 | execution: 12 | micro_batch_size: 1 13 | data_parallel: 1 14 | tensor_parallel: 4 15 | -------------------------------------------------------------------------------- 
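# --- Illustrative sketch (not a file in this repo): shard sizes implied by tests/test_config.yml ---
# The layer tests build their Hugging Face configs with intermediate_size = 4 * hidden_size,
# and LlamaFeedForwardTP splits that dimension across tensor_parallel replicas, so with the
# values above each of the 4 replicas holds a 128-column slice of gate_proj/up_proj:

hidden_size, tensor_parallel = 128, 4
intermediate_size = 4 * hidden_size
assert intermediate_size % tensor_parallel == 0
print(intermediate_size // tensor_parallel)  # 128 columns per replica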
/llama2-chatbot/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | from .simple_parsing_tools import * 3 | -------------------------------------------------------------------------------- /llama2-chatbot/utils/simple_parsing_tools.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | from examples_utils.parsing.simple_parsing_tools import * 4 | -------------------------------------------------------------------------------- /molfeat/requirements.txt: -------------------------------------------------------------------------------- 1 | pydantic<2 2 | molfeat==0.8.8 3 | transformers 4 | rdkit==2023.3.1 5 | stmol 6 | seaborn==0.12.2 7 | ipywidgets==8.0.6 8 | matplotlib 9 | numpy 10 | tabulate==0.9.0 11 | py3Dmol==2.0.1.post1 12 | torchinfo==1.7.2 13 | -------------------------------------------------------------------------------- /molfeat/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from typing import Optional 4 | import seaborn as sns 5 | 6 | from stmol import showmol 7 | import py3Dmol 8 | 9 | from rdkit import Chem 10 | from rdkit.Chem import AllChem 11 | import tabulate 12 | 13 | 14 | class Emoji: 15 | microscope = '\U0001F52C' 16 | test_tube = '\U0001F9EA' 17 | yes = '\u2705' 18 | no = '\u274C' 19 | ruler = '\U0001F4CF' 20 | warning = '\u26A0' 21 | rocket = '\U0001F680' 22 | 23 | 24 | def make_block(smi: str) -> str: 25 | mol = Chem.MolFromSmiles(smi) 26 | mol = Chem.AddHs(mol) 27 | AllChem.EmbedMolecule(mol) 28 | mblock = Chem.MolToMolBlock(mol) 29 | return mblock 30 | 31 | 32 | def render_mol(xyz: str): 33 | xyzview = py3Dmol.view() 34 | xyzview.addModel(xyz, 'mol') 35 | xyzview.setStyle({'stick': {}}) 36 | xyzview.setBackgroundColor('white') 37 | xyzview.zoomTo() 38 | showmol(xyzview, height=500, width=500) 39 | return xyzview 40 | 41 | 42 | def plot_3d_mol(smile: str): 43 | blk = make_block(smile) 44 | view = render_mol(blk) 45 | return view 46 | 47 | 48 | def report_molecule_classification(name: str, y_truth: bool, out: Optional[float], smile: str): 49 | table = [ 50 | ["Molecule:", name], 51 | ["BBBP:", f"{y_truth} (target) {Emoji.microscope}"], 52 | ] 53 | if out is not None: 54 | table.append(["Prediction:", f"{bool(out > 0.5)} {Emoji.test_tube}"]) 55 | table.append(["Correct:", f"{Emoji.yes}" if bool(out > 0.5) == y_truth else f"{Emoji.no}"]) 56 | print(tabulate.tabulate(table, tablefmt="heavy_grid")) 57 | 58 | return plot_3d_mol(smile) 59 | 60 | 61 | def report_molecule_regression(name: str, y_truth: float, out: Optional[float], smile: str, mask=None): 62 | table = [ 63 | ["Molecule:", name], 64 | ["exp:", f"{y_truth:.4f} (target) {Emoji.microscope}"], 65 | ] 66 | if out is not None: 67 | err = abs(y_truth - out) 68 | table.append(["Prediction:", f"{out:.4f} {Emoji.test_tube}"]) 69 | table.append(["|err|:", f"{err:.4f} " + (f"{Emoji.ruler}" if err < 1.5 else f"{Emoji.warning}")]) 70 | print(tabulate.tabulate(table, tablefmt="heavy_grid")) 71 | return plot_3d_mol(smile) 72 | 73 | 74 | def plot_smoothed_loss(epoch_losses: np.ndarray, window_size: int = 10): 75 | moving_avg = np.convolve(epoch_losses, np.ones(window_size) / window_size, mode='valid') 76 | moving_avg = np.clip(moving_avg, 0, None) 77 | 78 | q1, q3 = np.percentile(epoch_losses, [25, 75]) 79 | iqr = 
q3 - q1 80 | 81 | fig, ax = plt.subplots(figsize=(10, 5)) 82 | ax.plot(moving_avg, color='#FF6F79') 83 | ax.fill_between( 84 | range(len(moving_avg)), np.clip(moving_avg - iqr, 0, None), moving_avg + iqr, alpha=0.3, color='#FF6F79' 85 | ) 86 | 87 | ax.set_title('Smoothed Loss with IQR') 88 | ax.set_xlabel('Steps') 89 | ax.set_ylabel('Loss') 90 | 91 | plt.show() 92 | 93 | 94 | def plot_contours(test_y_true: np.ndarray, test_y_hat: np.ndarray, r2: float, mae: float): 95 | plt.style.use('seaborn') 96 | 97 | hist, xedges, yedges = np.histogram2d(test_y_true, test_y_hat, bins=10) 98 | X, Y = np.meshgrid(xedges[:-1], yedges[:-1]) 99 | Z = hist.T 100 | plt.contour(X, Y, Z, colors=None, levels=5, linewidths=1.5, alpha=0.7, cmap='viridis') 101 | 102 | plt.scatter(test_y_true, test_y_hat, alpha=0.7, edgecolors='k', linewidths=0.5) 103 | 104 | plt.gca().annotate( 105 | "$R2 = {:.2f}$\n MAE = {:.2f}".format(r2, mae), 106 | xy=(0.05, 0.9), 107 | xycoords='axes fraction', 108 | size=10, 109 | bbox=dict(boxstyle="round", fc=(1.0, 0.7, 0.7), ec="none"), 110 | ) 111 | 112 | plt.xlabel("y true") 113 | plt.ylabel("y pred") 114 | 115 | plt.show() 116 | -------------------------------------------------------------------------------- /multimodal/magma/configs/MAGMA_v1.yml: -------------------------------------------------------------------------------- 1 | { 2 | # image encoder settings 3 | encoder_name: 'clip_resnet_large', 4 | adapter_config: {"mlp": {"adapter_type": "normal", "downsample_factor": 4}}, 5 | freeze_img_encoder: false, 6 | 7 | # train settings 8 | batch_size: 256, 9 | train_steps: 150000, 10 | lr: 8.0e-4, 11 | min_lr: 0.0, 12 | lr_decay_iters: 300000, 13 | image_enc_lr: 2.0e-6, 14 | use_image_embed_layernorm: true, 15 | image_embed_dropout_prob: 0.1, 16 | image_size: 384, 17 | 18 | gradient_accumulation_steps: 8, 19 | zero_stage: 2, 20 | gradient_clipping: 1.0, 21 | 22 | # dataset / save / load settings 23 | train_dataset_name: 'conceptual_captions', 24 | train_dataset_dir: '/mnt/localdisk/conceptual_captions', 25 | eval_dataset_name: 'coco', 26 | eval_dataset_dir: '/mnt/localdisk/coco_data', 27 | 28 | save: "/mnt/shared_vol/checkpoints/multimodal_transformer_rn50x16", 29 | load: "/mnt/shared_vol/checkpoints/multimodal_transformer_rn50x16", 30 | 31 | eval_every: 100, 32 | 33 | } 34 | -------------------------------------------------------------------------------- /multimodal/magma/configs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
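# --- Illustrative sketch (not part of this file): using the plotting helpers from molfeat/utils.py above ---
# plot_smoothed_loss expects a 1-D array of per-step losses and overlays a moving average
# with an IQR band; plot_contours compares predictions against targets (r2/mae are only used
# for the annotation box). A toy call with synthetic data:
#
#   import numpy as np
#   losses = np.abs(np.random.randn(200)) * np.linspace(1.0, 0.1, 200)   # fake decaying loss
#   plot_smoothed_loss(losses, window_size=10)
#
#   y_true = np.random.randn(100)
#   y_hat = y_true + 0.1 * np.random.randn(100)
#   plot_contours(y_true, y_hat, r2=0.95, mae=0.08)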
2 | 3 | from .config import * 4 | import os 5 | from pathlib import Path 6 | 7 | CONFIG_DIR = Path(os.path.dirname(__file__)) 8 | 9 | del os, Path 10 | -------------------------------------------------------------------------------- /multimodal/magma/configs/inference.yml: -------------------------------------------------------------------------------- 1 | # -------- LM -------- 2 | gptj_6B: &gptj_6B 3 | layers: 28 4 | hidden_size: 4096 5 | sequence_length: 1024 6 | precision: 'float16' 7 | attention: 8 | heads: 16 9 | rotary_positional_embeddings_base: 10000 10 | rotary_dim: 64 11 | embedding: 12 | vocab_size: 50400 13 | real_vocab_size: 50258 14 | 15 | gptj_tiny: &gptj_tiny 16 | sequence_length: 8 17 | embedding: 18 | vocab_size: 128 19 | hidden_size: 64 20 | layers: 2 21 | attention: 22 | heads: 8 23 | rotary_dim: 8 24 | precision: "float16" 25 | 26 | # -------- VISUAL ENCODER -------- 27 | clip_resnet_large: &clip_resnet_large 28 | width: 96 29 | image_resolution: 384 30 | precision: 'float16' 31 | 32 | clip_tiny: &clip_tiny 33 | width: 36 # default 96 -> 1/8 default 34 | image_resolution: 48 # default 384 -> 1/8 default 35 | precision: 'float16' 36 | 37 | # -------- MAGMA -------- 38 | tiny: 39 | magma_v1: 40 | visual: 41 | <<: *clip_tiny 42 | execution: 43 | micro_batch_size: 1 44 | available_memory_proportion: [ 1.0 ] 45 | transformer: 46 | <<: *gptj_tiny 47 | ff_adapter: 48 | mode: 'normal' 49 | downsample_factor: 4 50 | execution: 51 | micro_batch_size: 1 52 | attention_serialisation: 1 53 | tensor_parallel: 2 54 | 55 | release: 56 | "magma_v1_1024": 57 | seed: 0 58 | visual: 59 | <<: *clip_resnet_large 60 | execution: 61 | micro_batch_size: 1 62 | available_memory_proportion: [ 1.0 ] 63 | transformer: 64 | <<: *gptj_6B 65 | ff_adapter: 66 | mode: 'normal' 67 | downsample_factor: 4 68 | execution: 69 | available_memory_proportion: [ 0.45 ] 70 | tensor_parallel: 4 71 | micro_batch_size: 1 72 | attention_serialisation: 1 73 | 74 | "magma_v1_500": 75 | seed: 0 76 | visual: 77 | <<: *clip_resnet_large 78 | transformer: 79 | <<: *gptj_6B 80 | sequence_length: 500 81 | execution: 82 | available_memory_proportion: [ 0.45 ] 83 | tensor_parallel: 4 84 | micro_batch_size: 1 85 | attention_serialisation: 1 86 | ff_adapter: 87 | mode: 'normal' 88 | downsample_factor: 4 89 | -------------------------------------------------------------------------------- /multimodal/magma/demo_example_images/cantaloupe_popsicle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/demo_example_images/cantaloupe_popsicle.jpg -------------------------------------------------------------------------------- /multimodal/magma/demo_example_images/circles.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/demo_example_images/circles.jpg -------------------------------------------------------------------------------- /multimodal/magma/demo_example_images/circles_square.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/demo_example_images/circles_square.jpg -------------------------------------------------------------------------------- 
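# --- Illustrative sketch (not a file in this repo): resolving multimodal/magma/configs/inference.yml ---
# As with the Llama config, the MAGMA YAML above composes each release entry from the
# *clip_resnet_large and *gptj_6B anchors. Loading it directly (PyYAML assumed, path
# relative to the repository root):

import yaml

with open("multimodal/magma/configs/inference.yml") as f:
    cfg = yaml.safe_load(f)

magma = cfg["release"]["magma_v1_1024"]
print(magma["visual"]["image_resolution"])                    # 384
print(magma["transformer"]["attention"]["heads"])             # 16
print(magma["transformer"]["execution"]["tensor_parallel"])   # 4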
/multimodal/magma/demo_example_images/korea.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/demo_example_images/korea.jpg -------------------------------------------------------------------------------- /multimodal/magma/demo_example_images/matterhorn.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/demo_example_images/matterhorn.jpg -------------------------------------------------------------------------------- /multimodal/magma/demo_example_images/mushroom.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/demo_example_images/mushroom.jpg -------------------------------------------------------------------------------- /multimodal/magma/demo_example_images/people.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/demo_example_images/people.jpg -------------------------------------------------------------------------------- /multimodal/magma/demo_example_images/playarea.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/demo_example_images/playarea.jpg -------------------------------------------------------------------------------- /multimodal/magma/demo_example_images/popsicle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/demo_example_images/popsicle.png -------------------------------------------------------------------------------- /multimodal/magma/demo_example_images/rainbow_popsicle.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/demo_example_images/rainbow_popsicle.jpeg -------------------------------------------------------------------------------- /multimodal/magma/demo_example_images/table_tennis.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/demo_example_images/table_tennis.jpg -------------------------------------------------------------------------------- /multimodal/magma/images/MagmaStructure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/images/MagmaStructure.png -------------------------------------------------------------------------------- /multimodal/magma/images/demo_magma.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/multimodal/magma/images/demo_magma.png -------------------------------------------------------------------------------- /multimodal/magma/modelling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | from .clip_resnet import * 4 | from .gptj import * 5 | from .adapters_TP import * 6 | from .image_prefix import * 7 | from .magma_mapping import * 8 | -------------------------------------------------------------------------------- /multimodal/magma/modelling/clip_resnet/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | from .batch_norm import * 4 | from .bottleneck import * 5 | from .stem import * 6 | from .modified_resnet import * 7 | from .attention_pool import * 8 | -------------------------------------------------------------------------------- /multimodal/magma/modelling/clip_resnet/batch_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | import numpy as np 4 | from functools import partial 5 | 6 | import popxl 7 | import popxl.ops as ops 8 | import popxl_addons as addons 9 | 10 | __all__ = ["BatchNorm2D"] 11 | 12 | 13 | class BatchNorm2D(addons.Module): 14 | def __init__(self, epsilon: float = 1e-5, momentum: float = 0.9): 15 | """ 16 | Implements Batch Normalization (only for inference) 17 | """ 18 | super().__init__() 19 | self.epsilon = epsilon 20 | self.momentum = momentum # Not used in inference; the default used is consistent with ONNX 21 | 22 | def build(self, x: popxl.Tensor) -> popxl.Tensor: 23 | 24 | shape = (x.shape[1],) 25 | 26 | self.weight = self.add_variable_input( 27 | "weight", 28 | partial(np.ones, shape), 29 | x.dtype, 30 | ) 31 | 32 | self.bias = self.add_variable_input( 33 | "bias", 34 | partial(np.zeros, shape), 35 | x.dtype, 36 | ) 37 | 38 | self.running_mean = self.add_variable_input( 39 | "running_mean", 40 | partial(np.zeros, shape), 41 | x.dtype, 42 | ) 43 | 44 | self.running_var = self.add_variable_input( 45 | "running_var", 46 | partial(np.ones, shape), 47 | x.dtype, 48 | ) 49 | 50 | y = ops.batch_norm_inference( 51 | x, 52 | scale=self.weight, 53 | bias=self.bias, 54 | mean=self.running_mean, 55 | var=self.running_var, 56 | epsilon=self.epsilon, 57 | momentum=self.momentum, 58 | ) 59 | 60 | return y 61 | -------------------------------------------------------------------------------- /multimodal/magma/modelling/clip_resnet/stem.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | import numpy as np 4 | 5 | from popxl_addons import Module, NamedTensors 6 | from popxl_addons.layers import Conv2D 7 | import popxl 8 | from popxl import Tensor 9 | from popxl import ops 10 | from popxl.utils import to_numpy 11 | from clip.model import ModifiedResNet as ClipModifiedResNet 12 | from configs import ResNetConfig 13 | from .batch_norm import BatchNorm2D 14 | 15 | __all__ = ["Stem"] 16 | 17 | 18 | class Stem(Module): 19 | def __init__(self, config: ResNetConfig): 20 | """ 21 | Stem block of CLIP ModifiedResNet. 
22 | Inference only: batch normalisation layers can work with a baked running mean and running vars, 23 | but these values won't be updated. 24 | """ 25 | super().__init__() 26 | self.config = config 27 | self.conv1 = Conv2D(self.config.width // 2, kernel_size=3, strides=(2, 2), paddings=(1, 1, 1, 1), bias=False) 28 | self.bn1 = BatchNorm2D() 29 | self.conv2 = Conv2D(self.config.width // 2, kernel_size=3, paddings=(1, 1, 1, 1), bias=False) 30 | self.bn2 = BatchNorm2D() 31 | self.conv3 = Conv2D(self.config.width, kernel_size=3, paddings=(1, 1, 1, 1), bias=False) 32 | self.bn3 = BatchNorm2D() 33 | 34 | def build(self, x: popxl.Tensor) -> popxl.Tensor: 35 | x = self.conv1(x) 36 | x = self.bn1(x) 37 | x = self.conv2(ops.relu(x)) 38 | x = self.bn2(x) 39 | x = self.conv3(ops.relu(x)) 40 | x = self.bn3(x) 41 | # NOTE: average pool in pytorch has stride default value = kernel size. 42 | # this is different in popxl so we need to set all parameters 43 | x = ops.average_pool(ops.relu(x), kernel_size=(2, 2), stride=(2, 2)) 44 | return x 45 | 46 | @staticmethod 47 | def clip_mapping(clip_model: ClipModifiedResNet, variables: NamedTensors): 48 | state_dict = { 49 | variables.conv1.weight: to_numpy(clip_model.conv1.weight.data), 50 | variables.bn1.weight: to_numpy(clip_model.bn1.weight.data), 51 | variables.bn1.bias: to_numpy(clip_model.bn1.bias.data), 52 | variables.bn1.running_mean: to_numpy(clip_model.bn1.running_mean.data), 53 | variables.bn1.running_var: to_numpy(clip_model.bn1.running_var.data), 54 | variables.conv2.weight: to_numpy(clip_model.conv2.weight.data), 55 | variables.bn2.weight: to_numpy(clip_model.bn2.weight.data), 56 | variables.bn2.bias: to_numpy(clip_model.bn2.bias.data), 57 | variables.bn2.running_mean: to_numpy(clip_model.bn2.running_mean.data), 58 | variables.bn2.running_var: to_numpy(clip_model.bn2.running_var.data), 59 | variables.conv3.weight: to_numpy(clip_model.conv3.weight.data), 60 | variables.bn3.weight: to_numpy(clip_model.bn3.weight.data), 61 | variables.bn3.bias: to_numpy(clip_model.bn3.bias.data), 62 | variables.bn3.running_mean: to_numpy(clip_model.bn3.running_mean.data), 63 | variables.bn3.running_var: to_numpy(clip_model.bn3.running_var.data), 64 | } 65 | return state_dict 66 | -------------------------------------------------------------------------------- /multimodal/magma/modelling/gptj/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | from .attention import * 4 | from .decoder import * 5 | from .embedding import * 6 | from .feed_forward import * 7 | from .gptj_lm import * 8 | from .gptj_model import * 9 | from .finetuneanon_mapping import * 10 | -------------------------------------------------------------------------------- /multimodal/magma/modelling/gptj/feed_forward.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
2 | from typing import Optional, List, Dict 3 | import torch 4 | import popxl 5 | from popxl import ops 6 | from popxl.utils import to_numpy 7 | from transformers.models.gpt_neo.modeling_gpt_neo import GPTNeoMLP as HFModel 8 | from transformers.models.gpt_neo.configuration_gpt_neo import GPTNeoConfig as GPTJConfigHF 9 | 10 | import popxl_addons as addons 11 | from configs import GPTJConfig 12 | from popxl_addons.named_tensors import NamedTensorData 13 | from popxl_addons.layers import Linear, LayerNorm 14 | import numpy as np 15 | 16 | from popxl_addons.named_tensors import NamedTensors 17 | from popxl_addons.ops.replicated_all_reduce_TP import ( 18 | replicated_all_reduce_identical_inputs, 19 | replicated_all_reduce_identical_grad_inputs, 20 | ) 21 | from popxl_addons.array_munging import shard 22 | 23 | 24 | class GPTJFeedForwardTP(addons.Module): 25 | def __init__(self, config: GPTJConfig, ff_size: Optional[int] = None): 26 | super().__init__() 27 | self.config = config 28 | tp = config.execution.tensor_parallel 29 | self.n_shards = tp 30 | self.replica_grouping = popxl.gcg().ir.replica_grouping(stride=tp, group_size=1) 31 | # Also known as the intermediate size 32 | self.ff_size = 4 * config.hidden_size if ff_size is None else ff_size 33 | assert self.ff_size % self.n_shards == 0 34 | # ----- Layers ----- 35 | # Sharded across devices - column wise 36 | self.intermediate = Linear(self.ff_size // self.n_shards, replica_grouping=self.replica_grouping) 37 | 38 | # Sharded across devices - row wise (bias applied separately) 39 | self.output = Linear(config.hidden_size, bias=False, replica_grouping=self.replica_grouping) 40 | 41 | def build(self, x: popxl.Tensor) -> List[popxl.Tensor]: 42 | """Identical input (x,) and identical output across shards.""" 43 | # ----- Identical computation ----- 44 | z = replicated_all_reduce_identical_inputs(x, group=self.replica_grouping.transpose()) 45 | 46 | # ----- Sharded computation ----- 47 | 48 | # Shard column-wise since gelu is not linear. 49 | # Indeed, sharding row wise requires a sum AllReduce at the end, 50 | # but gelu is not linear: gelu(x+y) != gelu(x) + gelu(y) 51 | z = self.intermediate(z) 52 | z = ops.gelu(z) 53 | # Here, x is already sharded across devices. 
Since we don't have non-linearities, 54 | # we can shard row-wise (which requires both X and the weight matrix to be sharded) 55 | # and then perform an all reduce 56 | z = self.output(z) 57 | 58 | z = replicated_all_reduce_identical_grad_inputs(z, group=self.replica_grouping.transpose()) 59 | 60 | # ----- Identical computation ----- 61 | 62 | # Output linear layer bias (identical bias on all devices) 63 | self.bias = self.add_variable_input("bias", lambda: np.zeros(z.shape[-1]), z.dtype) 64 | z = z + self.bias 65 | 66 | return z 67 | 68 | @staticmethod 69 | def finetuneanon_mapping( 70 | config: GPTJConfig, variables: NamedTensors, hf_model: HFModel 71 | ) -> Dict[popxl.Tensor, np.ndarray]: 72 | dtype = config.dtype 73 | n_shards = config.execution.tensor_parallel 74 | 75 | return { 76 | variables.intermediate.weight: shard(to_numpy(hf_model.c_fc.weight.data.T, dtype), n_shards, axis=-1), 77 | variables.intermediate.bias: shard(to_numpy(hf_model.c_fc.bias.data, dtype), n_shards, axis=-1), 78 | variables.output.weight: shard(to_numpy(hf_model.c_proj.weight.data.T, dtype), n_shards, axis=0), 79 | variables.bias: to_numpy(hf_model.c_proj.bias.data, dtype), 80 | } 81 | -------------------------------------------------------------------------------- /multimodal/magma/modelling/gptj/finetuneanon_mapping.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | from typing import Dict 4 | import numpy as np 5 | 6 | from transformers.models.gpt_neo import GPTNeoForCausalLM as HFLMHeadModel 7 | from transformers.models.gpt_neo import GPTNeoModel as HFModel 8 | 9 | from transformers.models.gpt_neo.configuration_gpt_neo import GPTNeoConfig as GPTJConfigHF 10 | 11 | import popxl 12 | from popxl_addons import TaskSession 13 | 14 | from configs import GPTJConfig 15 | from modelling.gptj.gptj_model import GPTJModelTP 16 | from modelling.gptj.gptj_lm import GPTJLMHeadModelTP 17 | 18 | def finetuneanon_mapping_lm_tp( 19 | config: GPTJConfig, session: TaskSession, pretrained: HFLMHeadModel 20 | ) -> Dict[popxl.Tensor, np.ndarray]: 21 | weights = GPTJLMHeadModelTP.finetuneanon_mapping(config, session.state.fwd, pretrained) 22 | return weights 23 | 24 | 25 | def finetuneanon_mapping_tp( 26 | config: GPTJConfig, session: TaskSession, pretrained: HFModel 27 | ) -> Dict[popxl.Tensor, np.ndarray]: 28 | weights = GPTJModelTP.finetuneanon_mapping(config, session.state.fwd, pretrained) 29 | return weights 30 | -------------------------------------------------------------------------------- /multimodal/magma/modelling/gptj/gptj_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved.
2 | import numpy as np 3 | from typing import Dict 4 | from configs import GPTJConfig 5 | import torch 6 | 7 | import popxl 8 | from popxl.utils import to_numpy 9 | 10 | import popxl_addons as addons 11 | from popxl_addons import NamedTensors 12 | from popxl_addons.named_tensors import NamedTensorData 13 | 14 | from popxl_addons.layers import LayerNorm 15 | 16 | from .embedding import GPTJEmbeddingsTP 17 | from .decoder import GPTJDecoderTP, GPTJDecoderBlockTP 18 | 19 | from transformers.models.gpt_neo.modeling_gpt_neo import GPTNeoModel as HFModel 20 | from transformers.models.gpt_neo.configuration_gpt_neo import GPTNeoConfig as GPTJConfigHF 21 | 22 | 23 | class GPTJModelTP(addons.Module): 24 | def __init__(self, config: GPTJConfig, include_layer_norm=True): 25 | super().__init__() 26 | self.config = config 27 | # sharded, then last bit identical 28 | self.embeddings = GPTJEmbeddingsTP(self.config) 29 | # identical inputs, then sharded, then identical 30 | self.decoder = GPTJDecoderTP(self.config) 31 | # identical 32 | self.include_layer_norm = include_layer_norm 33 | if self.include_layer_norm: 34 | self.ln_f = LayerNorm() 35 | 36 | def build(self, input_ids: popxl.Tensor): 37 | x = self.embeddings(input_ids) 38 | x = self.decoder(x) 39 | if self.include_layer_norm: 40 | x = self.ln_f(x) 41 | return x 42 | 43 | @staticmethod 44 | def finetuneanon_mapping( 45 | config: GPTJConfig, variables: NamedTensors, hf_model: HFModel, layer_norm=True, from_magma: bool = True 46 | ) -> Dict[popxl.Tensor, np.ndarray]: 47 | dtype = config.dtype 48 | weights = {} 49 | if layer_norm: 50 | weights = { 51 | variables.ln_f.weight: to_numpy(hf_model.ln_f.weight.data, dtype), 52 | variables.ln_f.bias: to_numpy(hf_model.ln_f.bias.data, dtype), 53 | } 54 | 55 | weights.update(GPTJEmbeddingsTP.finetuneanon_mapping(config, variables.embeddings, hf_model)) 56 | 57 | for l in range(config.layers): 58 | weights.update( 59 | GPTJDecoderBlockTP.finetuneanon_mapping( 60 | config, variables.decoder[l], hf_model.h[l], from_magma=from_magma 61 | ) 62 | ) 63 | 64 | return weights 65 | -------------------------------------------------------------------------------- /multimodal/magma/modelling/image_prefix.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | from popxl_addons import Module, NamedTensors 3 | from popxl_addons.layers import Linear, LayerNorm 4 | import popxl 5 | from popxl import Tensor 6 | from popxl import ops 7 | from popxl.utils import to_numpy 8 | 9 | from configs import MagmaConfig 10 | from modelling.clip_resnet.modified_resnet import ModifiedResNet 11 | import numpy as np 12 | 13 | 14 | class ImagePrefix(Module): 15 | def __init__(self, config: MagmaConfig): 16 | """ 17 | Takes in a batch of images and returns a batch of embeddings of the 18 | same dimensions as the LM's word embeddings. 
19 | """ 20 | super().__init__() 21 | self.config = config 22 | proj_out_dim = self.config.transformer.hidden_size 23 | # project to the language model hidden_dim 24 | self.proj = Linear(proj_out_dim) 25 | self.enc = ModifiedResNet(config.visual, pool=False) 26 | self.ln = LayerNorm() 27 | 28 | def build(self, x: popxl.Tensor) -> popxl.Tensor: 29 | # pass through image encoder 30 | #: (b, channels, h, w) 31 | embed = self.enc(x) 32 | #: b h w d 33 | assert len(embed.shape) == 4 34 | if embed.shape[1] == 1: 35 | embed = ops.squeeze(embed, [1, 2]) 36 | else: 37 | #: b (h w) d 38 | embed = embed.reshape((*embed.shape[:2], embed.shape[2] * embed.shape[3])).transpose((0, 2, 1)) 39 | #: b (h w) d -> b ( h w ) proj_out_dim = b ( h w ) lm_hidden_size 40 | embed = self.proj(embed) 41 | bs, hw, hidden = embed.shape 42 | embed = embed.reshape((bs * hw, hidden)) 43 | embed = self.ln(embed) 44 | embed = embed.reshape((bs, hw, hidden)) 45 | return embed 46 | 47 | @staticmethod 48 | def magma_mapping(magma_model, config, variables: NamedTensors): 49 | state_dict = ModifiedResNet.clip_mapping(magma_model.enc, config.visual, variables.enc, False) 50 | state_dict.update( 51 | { 52 | variables.ln.weight: to_numpy(magma_model.ln.weight), 53 | variables.ln.bias: to_numpy(magma_model.ln.bias), 54 | variables.proj.weight: to_numpy(magma_model.proj.weight.T), 55 | variables.proj.bias: to_numpy(magma_model.proj.bias), 56 | } 57 | ) 58 | return state_dict 59 | -------------------------------------------------------------------------------- /multimodal/magma/modelling/magma_mapping.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | from typing import Dict 4 | import numpy as np 5 | from torch import nn 6 | 7 | from transformers.models.gpt_neo.configuration_gpt_neo import GPTNeoConfig 8 | 9 | import popxl 10 | from popxl_addons import TaskSession 11 | 12 | from configs import MagmaConfig, GPTJConfig, CONFIG_DIR 13 | from modelling.image_prefix import ImagePrefix 14 | from modelling.gptj.gptj_lm import GPTJLMHeadModelTP 15 | 16 | from magma.image_encoders import clip_encoder 17 | from magma.magma import Magma 18 | import os 19 | 20 | 21 | def load_magma(path, config: MagmaConfig, check_config: bool = True) -> nn.Module: 22 | """ 23 | Loads magma checkpoint. 24 | """ 25 | model = Magma.from_checkpoint( 26 | config_path=os.path.join(CONFIG_DIR, "MAGMA_v1.yml"), 27 | checkpoint_path=path, 28 | device="cpu", 29 | ) 30 | if config.visual.precision == "float16": 31 | model.image_prefix.half() 32 | if config.transformer.precision == "float16": 33 | model.lm.half() 34 | 35 | if check_config: 36 | finetuneanon_lm_config_check(config.transformer, model.lm.config) 37 | 38 | return model 39 | 40 | 41 | def magma_mapping(config: MagmaConfig, session: TaskSession, magma: nn.Module) -> Dict[popxl.Tensor, np.ndarray]: 42 | weights = ImagePrefix.magma_mapping(magma.image_prefix, config, session.state.fwd.image_prefix) 43 | weights.update(GPTJLMHeadModelTP.finetuneanon_mapping(config.transformer, session.state.fwd, magma.lm)) 44 | return weights 45 | 46 | 47 | def finetuneanon_lm_config_check(config: GPTJConfig, finetuneanon_config: GPTNeoConfig): 48 | """ 49 | Compare a GPTJConfig with a finetuneanon GPTNeoConfig config and ensure they match. 
50 | Required if loading a pre-trained model 51 | """ 52 | if finetuneanon_config.jax == False: 53 | raise ValueError( 54 | "GPTNeo model in https://github.com/finetuneanon/transformers is equivalent to gptj only with jax=True" 55 | ) 56 | if finetuneanon_config.rotary == False: 57 | raise ValueError( 58 | "GPTNeo model in https://github.com/finetuneanon/transformers is equivalent to gptj only if rotary embedding is used" 59 | ) 60 | for attn in finetuneanon_config.attention_layers: 61 | if attn != "global": 62 | raise ValueError( 63 | 'GPTNeo model in https://github.com/finetuneanon/transformers is equivalent to gptj only if "global" attention is used' 64 | ) 65 | attn_type = finetuneanon_config.attention_types[0][0] 66 | if attn_type != "global": 67 | raise ValueError( 68 | 'GPTNeo model in https://github.com/finetuneanon/transformers is equivalent to gptj only if "global" attention is used' 69 | ) 70 | 71 | params = [ 72 | ("hidden_size", config.hidden_size, finetuneanon_config.hidden_size), 73 | ("heads", config.attention.heads, finetuneanon_config.num_heads), 74 | ("layers", config.layers, finetuneanon_config.num_layers), 75 | ("vocab_size", config.embedding.real_vocab_size, finetuneanon_config.vocab_size), 76 | ("rotary_dim", config.attention.rotary_dim, finetuneanon_config.rotary_dim), 77 | ] 78 | 79 | if not all(xl == hf for _, xl, hf in params): 80 | not_eq_str = ", ".join(f"\n`{name}` not equal, config: {xl}, hf: {hf}" for name, xl, hf in params if xl != hf) 81 | raise ValueError( 82 | f"Config does not match the GPTNeo pre-trained model from https://github.com/finetuneanon/transformers. Not matching: {not_eq_str}" 83 | ) 84 | -------------------------------------------------------------------------------- /multimodal/magma/requirements.txt: -------------------------------------------------------------------------------- 1 | --find-links https://download.pytorch.org/whl/torch_stable.html 2 | 3 | numpy 4 | torch==2.0.1+cpu 5 | #examples-utils[common] @ git+https://github.com/graphcore/examples-utils.git@7cd37a8eccabe88e3741eef2c31bafd4fcd30c4c 6 | examples-utils[common] @ git+https://github.com/graphcore/examples-utils.git@v3.3 7 | graphcore-cloud-tools[logger] @ git+https://github.com/graphcore/graphcore-cloud-tools@v0.1 8 | pyyaml==5.4.1 9 | dataclasses 10 | typeguard==2.13.3 11 | scipy~=1.10.1 12 | 13 | pytest==6.2.5 14 | pytest-pythonpath==0.7.4 15 | 16 | jupyter 17 | ipywidgets 18 | 19 | git+https://github.com/graphcore/popxl-addons.git@sdk-release-3.3.0 20 | git+https://github.com/Aleph-Alpha/magma@4d01e5172115ab4a8f4b4bf8da76dbc08b6cf36c 21 | -------------------------------------------------------------------------------- /multimodal/magma/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | from .simple_parsing_tools import * 3 | -------------------------------------------------------------------------------- /multimodal/magma/utils/sampling.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
2 | # Copyright (c) 2022 Aleph Alpha GmbH 3 | 4 | from magma.sampling import top_k_filter, top_p_filter 5 | import torch 6 | import torch.nn.functional as F 7 | 8 | 9 | def generate(logits: torch.Tensor, top_k: float = 0.0, top_p: float = 0.9, temperature: float = 0.7): 10 | # taken from https://github.com/Aleph-Alpha/magma/blob/master/magma/sampling.py 11 | if temperature == 0.0: 12 | next_token = torch.argmax(logits, dim=-1, keepdims=True) 13 | else: 14 | if top_k > 0: 15 | logits = top_k_filter(logits, k=top_k) 16 | if top_p > 0: 17 | logits = top_p_filter(logits, threshold=top_p) 18 | 19 | probs = F.softmax(logits / temperature, dim=-1) 20 | next_token = torch.multinomial(probs, num_samples=1) 21 | return next_token 22 | -------------------------------------------------------------------------------- /multimodal/magma/utils/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | import argparse 4 | import atexit 5 | import logging 6 | import os 7 | import random 8 | import tempfile 9 | from argparse import ArgumentParser 10 | from pathlib import Path 11 | from typing import Optional, Callable, Tuple, Union, List 12 | 13 | import numpy as np 14 | import popart 15 | import torch 16 | 17 | from popxl_addons import GIT_COMMIT as ADDONS_GIT_COMMIT 18 | from popxl_addons.utils import timer 19 | 20 | from configs import MagmaConfig 21 | from utils.simple_parsing_tools import parse_args_with_presets 22 | import popdist 23 | import sys 24 | 25 | __all__ = ["set_random_seeds", "magma_config_setup"] 26 | 27 | 28 | def set_random_seeds(seed: int) -> None: 29 | """ 30 | Initialise seeds on host (numpy, torch, random) 31 | to guarantee deterministic results 32 | """ 33 | np.random.seed(seed) 34 | torch.manual_seed(seed) 35 | random.seed(seed) 36 | 37 | 38 | def magma_config_setup( 39 | config_file: Union[str, Path], 40 | presets_key: str, 41 | default: str, 42 | CLI_args: Optional[str] = None, 43 | ) -> Tuple[MagmaConfig, argparse.Namespace]: 44 | """Parse command line args and set up the random seed and logging. 45 | Args: 46 | config_file: Path to config file (yaml) 47 | presets_key: Which key in the config to use 48 | default: Default model config 49 | CLI_args: Extra command line arguments to customise configuration 50 | 51 | Returns: 52 | MagmaConfig and argparse namespace 53 | """ 54 | 55 | def custom_args(parser: ArgumentParser): 56 | log_level = os.environ.get("APP_LOG_LEVEL", "INFO") 57 | parser.add_argument( 58 | "--log_level", 59 | choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], 60 | type=str, 61 | default=log_level, 62 | help=("Logging level for the app. " "Can also be set using the environment variable `APP_LOG_LEVEL`"), 63 | ) 64 | # needed for jupyter notebooks 65 | parser.add_argument("-f", type=str, default="", help="jupyter") 66 | 67 | config, args = parse_args_with_presets(MagmaConfig, config_file, presets_key, default, custom_args, CLI_args) 68 | config: MagmaConfig # type: ignore 69 | config.validate() 70 | 71 | set_random_seeds(config.seed) 72 | 73 | logging_setup(args, config) 74 | 75 | return config, args 76 | 77 | 78 | def logging_setup(args, config): 79 | """Setup logging""" 80 | logging.basicConfig( 81 | level=args.log_level, 82 | format="%(asctime)s %(levelname)s: %(message)s", 83 | datefmt="%Y-%m-%d %H:%M:%S", 84 | stream=sys.stdout, 85 | ) 86 | logging.info(f"Starting.
Process id: {os.getpid()}") 87 | logging.info(f"Config: {config}") 88 | -------------------------------------------------------------------------------- /multimodal/magma/utils/simple_parsing_tools.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | from examples_utils.parsing.simple_parsing_tools import * 4 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/api.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | 3 | from utils.trainer import T5Trainer 4 | from utils.pipeline import T5Pipeline 5 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | from .config import T5Config, Execution 3 | import os 4 | from pathlib import Path 5 | 6 | CONFIG_DIR = Path(os.path.dirname(__file__)) 7 | 8 | del os, Path 9 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/config/finetuning.yml: -------------------------------------------------------------------------------- 1 | # -------- Models -------- 2 | xxl: &xxl 3 | model: 4 | sequence_length: 512 5 | embedding: 6 | vocab_size: 32128 7 | hidden_size: 4096 8 | d_ff: 10240 9 | layers: 24 10 | attention: 11 | heads: 64 12 | d_kv: 64 13 | training: 14 | global_batch_size: 128 15 | steps: 500 16 | optimizer: 17 | name: adamw 18 | learning_rate: 19 | maximum: 5e-6 20 | warmup_steps: 10 21 | weight_decay: 0.0 22 | 23 | xl: &xl 24 | model: 25 | sequence_length: 512 26 | embedding: 27 | vocab_size: 32128 28 | hidden_size: 2048 29 | d_ff: 5120 30 | layers: 24 31 | attention: 32 | heads: 32 33 | d_kv: 64 34 | training: 35 | global_batch_size: 128 36 | steps: 500 37 | optimizer: 38 | name: adamw 39 | learning_rate: 40 | maximum: 5e-6 41 | warmup_steps: 10 42 | weight_decay: 0.01 43 | 44 | tiny: &tiny 45 | model: 46 | sequence_length: 512 47 | embedding: 48 | vocab_size: 128 49 | hidden_size: 64 50 | d_ff: 256 51 | layers: 4 52 | attention: 53 | heads: 4 54 | d_kv: 16 55 | training: 56 | global_batch_size: 16 57 | steps: 10 58 | optimizer: 59 | name: adamw 60 | learning_rate: 61 | maximum: 1e-5 62 | warmup_steps: 0 63 | weight_decay: 0.01 64 | 65 | # ------------------------- 66 | 67 | 68 | # ------- Execution ------- 69 | release: 70 | xxl_pod64: 71 | <<: *xxl 72 | execution: 73 | io_tiles: 128 74 | micro_batch_size: 1 75 | loss_scaling: 1 76 | data_parallel: 4 77 | tensor_parallel: 16 78 | available_memory_proportion: [ 0.2 ] 79 | 80 | xxl_pod16: 81 | <<: *xxl 82 | execution: 83 | io_tiles: 128 84 | micro_batch_size: 1 85 | loss_scaling: 1 86 | data_parallel: 1 87 | tensor_parallel: 16 88 | available_memory_proportion: [ 0.2 ] 89 | 90 | xl_pod16: 91 | <<: *xl 92 | execution: 93 | io_tiles: 128 94 | micro_batch_size: 1 95 | loss_scaling: 1 96 | data_parallel: 2 97 | tensor_parallel: 8 98 | available_memory_proportion: [ 0.2 ] 99 | 100 | xl_pod8: 101 | <<: *xl 102 | execution: 103 | io_tiles: 128 104 | micro_batch_size: 1 105 | loss_scaling: 1 106 | data_parallel: 1 107 | tensor_parallel: 8 108 | available_memory_proportion: [ 0.2 ] 109 | 110 | tiny: 111 | 
<<: *tiny 112 | execution: 113 | io_tiles: 64 114 | micro_batch_size: 1 115 | data_parallel: 2 116 | tensor_parallel: 2 117 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/config/inference.yml: -------------------------------------------------------------------------------- 1 | # -------- Models -------- 2 | xxl: &xxl 3 | model: 4 | eval: true 5 | sequence_length: 512 6 | embedding: 7 | vocab_size: 32128 8 | hidden_size: 4096 9 | d_ff: 10240 10 | layers: 24 11 | attention: 12 | heads: 64 13 | d_kv: 64 14 | 15 | xl: &xl 16 | model: 17 | eval: true 18 | sequence_length: 512 19 | embedding: 20 | vocab_size: 32128 21 | hidden_size: 2048 22 | d_ff: 5120 23 | layers: 24 24 | attention: 25 | heads: 32 26 | d_kv: 64 27 | 28 | tiny: &tiny 29 | model: 30 | eval: true 31 | sequence_length: 512 32 | embedding: 33 | vocab_size: 128 34 | hidden_size: 64 35 | d_ff: 256 36 | layers: 4 37 | attention: 38 | heads: 4 39 | d_kv: 16 40 | 41 | # ------------------------- 42 | 43 | # ------- Execution ------- 44 | release: 45 | xxl-mnli: 46 | <<: *xxl 47 | execution: 48 | micro_batch_size: 20 49 | available_memory_proportion: [ 0.4 ] 50 | tensor_parallel: 16 51 | 52 | xxl: 53 | <<: *xxl 54 | execution: 55 | micro_batch_size: 12 56 | available_memory_proportion: [ 0.4 ] 57 | tensor_parallel: 16 58 | 59 | xl-mnli: 60 | <<: *xl 61 | execution: 62 | micro_batch_size: 24 63 | available_memory_proportion: [ 0.4 ] 64 | tensor_parallel: 8 65 | 66 | xl: 67 | <<: *xl 68 | execution: 69 | micro_batch_size: 16 70 | available_memory_proportion: [ 0.4 ] 71 | tensor_parallel: 8 72 | 73 | tiny: 74 | <<: *tiny 75 | execution: 76 | micro_batch_size: 2 77 | available_memory_proportion: [ 0.4 ] 78 | tensor_parallel: 2 79 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/graphs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/imgs/mnli_dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/doing-more-with-flan-t5/imgs/mnli_dataset.png -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/modelling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/modelling/hf_mapping.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
2 | from typing import Dict 3 | import numpy as np 4 | 5 | from transformers.models.t5 import T5Model as HFModel 6 | from transformers.models.t5 import T5ForConditionalGeneration as HFLMHeadModel 7 | 8 | import popxl 9 | from popxl_addons import TaskSession 10 | 11 | from config import T5Config 12 | from modelling.t5_model import T5ModelTP 13 | from modelling.t5_lm import T5LMHeadModelTP 14 | 15 | 16 | def hf_mapping_lm_tp( 17 | config: T5Config, session: TaskSession, pretrained: HFLMHeadModel 18 | ) -> Dict[popxl.Tensor, np.ndarray]: 19 | load_to = session.state 20 | if "fwd" in session.state: 21 | load_to = session.state.fwd 22 | weights = T5LMHeadModelTP.hf_mapping(config, load_to, pretrained) 23 | return weights 24 | 25 | 26 | def hf_mapping_TP(config: T5Config, session: TaskSession, pretrained: HFModel) -> Dict[popxl.Tensor, np.ndarray]: 27 | load_to = session.state 28 | if "fwd" in session.state: 29 | load_to = session.state.fwd 30 | weights = T5ModelTP.hf_mapping(config, load_to, pretrained) 31 | return weights 32 | 33 | 34 | def load_lm_to_hf(session: TaskSession, hf_model: HFLMHeadModel) -> HFLMHeadModel: 35 | weights = session.get_named_tensors_data() 36 | if "fwd" in weights: 37 | weights = weights.fwd 38 | state_dict = T5LMHeadModelTP.to_hf(weights, hf_model) 39 | hf_model.load_state_dict(state_dict) 40 | return hf_model 41 | 42 | 43 | def load_to_hf(session: TaskSession, hf_model: HFModel) -> HFModel: 44 | weights = session.get_named_tensors_data() 45 | if "fwd" in weights: 46 | weights = weights.fwd 47 | state_dict = T5ModelTP.to_hf(weights, hf_model) 48 | hf_model.load_state_dict(state_dict) 49 | return hf_model 50 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/modelling/layer_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | from functools import partial 3 | from typing import Dict 4 | import torch 5 | import popxl 6 | from popxl import ops 7 | from popxl.utils import to_numpy 8 | from transformers.models.t5.modeling_t5 import T5LayerNorm as HFModel 9 | from transformers.models.t5.configuration_t5 import T5Config as T5ConfigHF 10 | 11 | import popxl_addons as addons 12 | from config import T5Config 13 | from popxl_addons.named_tensors import NamedTensorData 14 | import numpy as np 15 | 16 | from popxl_addons.named_tensors import NamedTensors 17 | 18 | 19 | class T5LayerNorm(addons.Module): 20 | def __init__(self, config: T5Config): 21 | super().__init__() 22 | self.eps = config.model.eps 23 | self.dtype = config.model.dtype 24 | 25 | def build(self, x: popxl.Tensor) -> popxl.Tensor: 26 | """ 27 | Build layer normalisation for T5. No bias and no subtraction of mean. 
28 | """ 29 | w = self.add_variable_input("weight", partial(np.ones, x.shape[-1]), self.dtype) 30 | 31 | # Perform the computation in float32 32 | if x.dtype == popxl.float16: 33 | x = ops.cast(x, popxl.float32) 34 | variance = ops.mean(x * x, -1, keepdims=True) 35 | x = x / ops.sqrt(variance + self.eps) 36 | 37 | # Cast back down to float16 if needed 38 | if w.dtype == popxl.float16: 39 | x = ops.cast(x, popxl.float16) 40 | 41 | x = x * w 42 | return x 43 | 44 | @staticmethod 45 | def hf_mapping(config: T5Config, variables: NamedTensors, hf_model: HFModel) -> Dict[popxl.Tensor, np.ndarray]: 46 | dtype = config.model.dtype 47 | weights = { 48 | variables.weight: to_numpy(hf_model.weight.data, dtype), 49 | } 50 | return weights 51 | 52 | @staticmethod 53 | def to_hf(config: T5ConfigHF, popxl_state_dict: NamedTensorData, hf_model: HFModel) -> Dict[str, torch.Tensor]: 54 | state_dict = {} 55 | state_dict["weight"] = torch.tensor(popxl_state_dict.weight, dtype=config.torch_dtype) 56 | return state_dict 57 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest==6.2.5 2 | pytest-pythonpath==0.7.4 3 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/requirements.txt: -------------------------------------------------------------------------------- 1 | --find-links https://download.pytorch.org/whl/cpu/torch_stable.html 2 | 3 | graphcore-cloud-tools[logger] @ git+https://github.com/graphcore/graphcore-cloud-tools@v0.3 4 | examples-utils[common] @ git+https://github.com/graphcore/examples-utils.git@latest_stable 5 | pyyaml==5.4.1 6 | dataclasses==0.8; python_version < '3.7' 7 | transformers==4.25.1 8 | datasets 9 | evaluate==0.4.0 10 | tfrecord==1.14.1 11 | torch==2.0.1+cpu 12 | numpy 13 | scipy>=1.5.4 14 | more-itertools==8.13.0 15 | wandb==0.12.8 16 | scikit-learn 17 | 18 | git+https://github.com/graphcore/popxl-addons.git@sdk-release-3.3.0 19 | 20 | protobuf==3.20.*; python_version > '3.6' 21 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | from .simple_parsing_tools import * 3 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/utils/simple_parsing_tools.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 2 | from examples_utils.parsing.simple_parsing_tools import * 3 | -------------------------------------------------------------------------------- /natural-language-processing/doing-more-with-flan-t5/utils/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
2 | from typing import Dict 3 | 4 | import numpy as np 5 | import time 6 | 7 | 8 | def _linear_schedule(start: int, end: int, interval: int, low: float, high: float) -> Dict[int, float]: 9 | update_steps = np.arange(start, end + 1, interval).astype(np.uint32) 10 | updates = np.linspace(low, high, len(update_steps)) 11 | return dict(zip(update_steps, updates)) 12 | 13 | 14 | def warmup_schedule(total_steps: int, minimum: float, maximum: float, warmup_steps: int = 0) -> Dict[int, float]: 15 | """Learning rate schedule with linear warm up and then remains at max. 16 | 17 | Linearly increase from `minimum` to `maximum` for `warmup_steps` steps. 18 | Then constant at the `maximum` learning rate for the remaining steps. 19 | 20 | Returns a dict that maps step to learning rate. 21 | """ 22 | schedule = {} 23 | if warmup_steps > 0: 24 | schedule.update(_linear_schedule(0, warmup_steps, 1, minimum, maximum)) 25 | 26 | schedule.update(_linear_schedule(warmup_steps, total_steps, 1, maximum, maximum)) # maximum to maximum so constant 27 | return schedule 28 | 29 | 30 | class SimpleTimer: 31 | def __init__(self): 32 | self._start = None 33 | 34 | def start(self): 35 | self._start = time.perf_counter() 36 | 37 | def stop(self): 38 | self.elapsed = time.perf_counter() - self._start 39 | self._start = None 40 | 41 | def __enter__(self): 42 | self.start() 43 | return self 44 | 45 | def __exit__(self, *exc_args): 46 | self.stop() 47 | -------------------------------------------------------------------------------- /natural-language-processing/images/bert-pipelining.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/bert-pipelining.png -------------------------------------------------------------------------------- /natural-language-processing/images/bert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/bert.png -------------------------------------------------------------------------------- /natural-language-processing/images/causal_language_modeling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/causal_language_modeling.png -------------------------------------------------------------------------------- /natural-language-processing/images/masked_language_modeling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/masked_language_modeling.png -------------------------------------------------------------------------------- /natural-language-processing/images/name_entity_extraction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/name_entity_extraction.png -------------------------------------------------------------------------------- /natural-language-processing/images/partitioning.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/partitioning.jpg -------------------------------------------------------------------------------- /natural-language-processing/images/pipelining.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/pipelining.png -------------------------------------------------------------------------------- /natural-language-processing/images/question_answering.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/question_answering.png -------------------------------------------------------------------------------- /natural-language-processing/images/recomputation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/recomputation.png -------------------------------------------------------------------------------- /natural-language-processing/images/rts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/rts.png -------------------------------------------------------------------------------- /natural-language-processing/images/squad.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/squad.png -------------------------------------------------------------------------------- /natural-language-processing/images/summarization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/summarization.png -------------------------------------------------------------------------------- /natural-language-processing/images/t5_vs_flan_t5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/t5_vs_flan_t5.png -------------------------------------------------------------------------------- /natural-language-processing/images/text_classification.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/text_classification.png -------------------------------------------------------------------------------- /natural-language-processing/images/token_classification.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/token_classification.png -------------------------------------------------------------------------------- /natural-language-processing/images/translation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/images/translation.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/bert-pipelining.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/bert-pipelining.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/bert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/bert.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/causal_language_modeling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/causal_language_modeling.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/masked_language_modeling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/masked_language_modeling.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/mt5_oom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/mt5_oom.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/name_entity_extraction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/name_entity_extraction.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/partitioning.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/partitioning.jpg -------------------------------------------------------------------------------- 
/natural-language-processing/other-use-cases/images/pipelining.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/pipelining.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/question_answering.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/question_answering.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/recomputation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/recomputation.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/restart_kernel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/restart_kernel.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/rts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/rts.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/squad.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/squad.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/summarization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/summarization.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/text_classification.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/text_classification.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/token_classification.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/token_classification.png -------------------------------------------------------------------------------- /natural-language-processing/other-use-cases/images/translation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/natural-language-processing/other-use-cases/images/translation.png -------------------------------------------------------------------------------- /packed-bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/packed-bert/__init__.py -------------------------------------------------------------------------------- /packed-bert/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/packed-bert/models/__init__.py -------------------------------------------------------------------------------- /packed-bert/pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/packed-bert/pipeline/__init__.py -------------------------------------------------------------------------------- /packed-bert/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/packed-bert/utils/__init__.py -------------------------------------------------------------------------------- /packed-bert/utils/packing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/packed-bert/utils/packing/__init__.py -------------------------------------------------------------------------------- /stable-diffusion/requirements.txt: -------------------------------------------------------------------------------- 1 | optimum-graphcore==0.7 2 | matplotlib 3 | graphcore-cloud-tools[logger] @ git+https://github.com/graphcore/graphcore-cloud-tools@v0.3 4 | -------------------------------------------------------------------------------- /stable-diffusion/sample_images/image_to_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/stable-diffusion/sample_images/image_to_image.png -------------------------------------------------------------------------------- /stable-diffusion/sample_images/inpainting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/stable-diffusion/sample_images/inpainting.png -------------------------------------------------------------------------------- /stable-diffusion/sample_images/text_to_image.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/stable-diffusion/sample_images/text_to_image.png -------------------------------------------------------------------------------- /stable-diffusion/sample_images/text_to_image_sd2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/stable-diffusion/sample_images/text_to_image_sd2.png -------------------------------------------------------------------------------- /useful-tips/images/connect-tunnel-from-web-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/useful-tips/images/connect-tunnel-from-web-1.png -------------------------------------------------------------------------------- /useful-tips/images/connect-tunnel-from-web-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/useful-tips/images/connect-tunnel-from-web-2.png -------------------------------------------------------------------------------- /useful-tips/images/connect-tunnel-to-app-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/useful-tips/images/connect-tunnel-to-app-1.png -------------------------------------------------------------------------------- /useful-tips/images/connect-tunnel-to-app-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/useful-tips/images/connect-tunnel-to-app-2.png -------------------------------------------------------------------------------- /useful-tips/images/login-code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/useful-tips/images/login-code.png -------------------------------------------------------------------------------- /useful-tips/images/login-success.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/useful-tips/images/login-success.png -------------------------------------------------------------------------------- /useful-tips/images/restart_kernel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/useful-tips/images/restart_kernel.png -------------------------------------------------------------------------------- /useful-tips/images/tunnel-unregister.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphcore/Gradient-HuggingFace/57e161ae28fc2eca2a69547fe5b2902068ece779/useful-tips/images/tunnel-unregister.png --------------------------------------------------------------------------------