├── .gitignore ├── LICENSE ├── README.md ├── datasets ├── procthor_converted │ └── README.md └── rplan_converted │ └── README.md ├── generations └── README.md ├── models └── README.md ├── procthor_dataset.py ├── procthor_dataset_convert.py ├── recipes ├── README.md ├── benchmarks │ ├── fmbench │ │ ├── README.md │ │ ├── config.yml │ │ └── img │ │ │ ├── CFT.png │ │ │ ├── instances.png │ │ │ └── latency_vs_tokens.png │ └── inference_throughput │ │ ├── README.md │ │ ├── cloud-api │ │ ├── README.md │ │ └── azure │ │ │ ├── chat_azure_api_benchmark.py │ │ │ ├── input.jsonl │ │ │ ├── parameters.json │ │ │ └── pretrained_azure_api_benchmark.py │ │ ├── on-prem │ │ ├── README.md │ │ └── vllm │ │ │ ├── chat_vllm_benchmark.py │ │ │ ├── input.jsonl │ │ │ ├── parameters.json │ │ │ └── pretrained_vllm_benchmark.py │ │ ├── requirements.txt │ │ └── tokenizer │ │ ├── special_tokens_map.json │ │ ├── tokenizer.json │ │ ├── tokenizer.model │ │ └── tokenizer_config.json ├── code_llama │ ├── README.md │ ├── code_completion_example.py │ ├── code_completion_prompt.txt │ ├── code_infilling_example.py │ ├── code_infilling_prompt.txt │ └── code_instruct_example.py ├── evaluation │ ├── README.md │ ├── eval.py │ ├── open_llm_eval_prep.sh │ └── open_llm_leaderboard │ │ ├── arc_challeneg_25shots.yaml │ │ ├── hellaswag_10shots.yaml │ │ ├── hellaswag_utils.py │ │ ├── mmlu_5shots.yaml │ │ └── winogrande_5shots.yaml ├── finetuning │ ├── LLM_finetuning_overview.md │ ├── README.md │ ├── datasets │ │ ├── README.md │ │ └── custom_dataset.py │ ├── finetuning.py │ ├── huggingface_trainer │ │ └── peft_finetuning.ipynb │ ├── multi_node.slurm │ ├── multigpu_finetuning.md │ └── singlegpu_finetuning.md ├── inference │ ├── llama_web_ui │ │ ├── Llama2_Gradio.ipynb │ │ ├── README.md │ │ ├── requirements.txt │ │ └── streamlit_llama2.py │ ├── local_inference │ │ ├── README.md │ │ ├── chat_completion │ │ │ ├── chat_completion.py │ │ │ └── chats.json │ │ ├── inference.py │ │ └── samsum_prompt.txt │ └── model_servers │ │ ├── README.md │ │ ├── hf_text_generation_inference │ │ ├── README.md │ │ └── merge_lora_weights.py │ │ ├── llama-on-prem.md │ │ └── vllm │ │ └── inference.py ├── llama_api_providers │ ├── Azure_API_example │ │ └── azure_api_example.ipynb │ ├── OctoAI_API_examples │ │ ├── Getting_to_know_Llama.ipynb │ │ ├── HelloLlamaCloud.ipynb │ │ ├── LiveData.ipynb │ │ ├── Llama2_Gradio.ipynb │ │ ├── RAG_Chatbot_example │ │ │ ├── RAG_Chatbot_Example.ipynb │ │ │ ├── data │ │ │ │ └── Llama Getting Started Guide.pdf │ │ │ ├── requirements.txt │ │ │ └── vectorstore │ │ │ │ └── db_faiss │ │ │ │ ├── index.faiss │ │ │ │ └── index.pkl │ │ └── VideoSummary.ipynb │ ├── Using_Externally_Hosted_LLMs.ipynb │ └── examples_with_aws │ │ ├── Prompt_Engineering_with_Llama_2_On_Amazon_Bedrock.ipynb │ │ ├── ReAct_Llama_3_Bedrock-WK.ipynb │ │ └── getting_started_llama_3_on_amazon_bedrock.ipynb ├── multilingual │ ├── README.md │ ├── extend_tokenizer.py │ ├── imgs │ │ ├── phase1-eval-loss.png │ │ ├── phase1-train-loss.png │ │ ├── phase2-eval-loss.png │ │ └── phase2-train-loss.png │ ├── prepare_data.py │ └── train_tokenizer.py ├── quickstart │ ├── Getting_to_know_Llama.ipynb │ ├── Prompt_Engineering_with_Llama_2.ipynb │ └── Running_Llama3_Anywhere │ │ ├── Running_Llama_on_HF_transformers.ipynb │ │ └── Running_Llama_on_Mac_Windows_Linux.ipynb ├── responsible_ai │ ├── CodeShieldUsageDemo.ipynb │ ├── Purple_Llama_Anyscale.ipynb │ ├── Purple_Llama_OctoAI.ipynb │ ├── README.md │ ├── input_output_guardrails_with_llama.ipynb │ └── llama_guard │ │ ├── README.md │ │ ├── 
__init__.py │ │ └── inference.py └── use_cases │ ├── LiveData.ipynb │ ├── RAG │ └── HelloLlamaCloud.ipynb │ ├── README.md │ ├── VideoSummary.ipynb │ ├── chatbots │ ├── RAG_chatbot │ │ ├── RAG_Chatbot_Example.ipynb │ │ ├── data │ │ │ └── Llama Getting Started Guide.pdf │ │ ├── requirements.txt │ │ └── vectorstore │ │ │ ├── db_faiss │ │ │ ├── index.faiss │ │ │ └── index.pkl │ │ │ └── mongodb │ │ │ └── rag_mongodb_llama3_huggingface_open_source.ipynb │ ├── messenger_llama │ │ ├── llama_messenger.py │ │ └── messenger_llama3.md │ └── whatsapp_llama │ │ ├── llama_chatbot.py │ │ └── whatsapp_llama3.md │ └── text2sql │ ├── StructuredLlama.ipynb │ ├── csv2db.py │ ├── nba.txt │ ├── nba_roster.db │ └── txt2csv.py ├── requirements.txt ├── requirements_llama3.txt ├── rplan_dataset.py ├── rplan_dataset_convert.py ├── run_generation_procthor.py ├── run_generation_rplan.py ├── run_metric.py └── src ├── __init__.py ├── llama_recipes ├── configs │ ├── __init__.py │ ├── datasets.py │ ├── fsdp.py │ ├── peft.py │ ├── training.py │ └── wandb.py ├── data │ ├── __init__.py │ ├── concatenator.py │ ├── llama_guard │ │ ├── README.md │ │ ├── __init__.py │ │ ├── finetuning_data_formatter.py │ │ └── finetuning_data_formatter_example.py │ └── sampler.py ├── finetuning.py ├── finetuning_bbox.py ├── inference │ ├── __init__.py │ ├── chat_utils.py │ ├── checkpoint_converter_fsdp_hf.py │ ├── llm.py │ ├── model_utils.py │ ├── prompt_format_utils.py │ └── safety_utils.py ├── model_checkpointing │ ├── __init__.py │ └── checkpoint_handler.py ├── policies │ ├── __init__.py │ ├── activation_checkpointing_functions.py │ ├── anyprecision_optimizer.py │ ├── mixed_precision.py │ └── wrapping.py ├── tools │ └── convert_hf_weights_to_llama.py └── utils │ ├── __init__.py │ ├── config_utils.py │ ├── dataset_utils.py │ ├── flop_utils.py │ ├── fsdp_utils.py │ ├── hf_llama_conversion │ ├── README.md │ └── compare_llama_weights.py │ ├── memory_utils.py │ ├── plot_metrics.py │ └── train_utils.py ├── metrics ├── __init__.py ├── file_consistency.py ├── prompt_consistency.py └── self_consistency.py ├── pred ├── __init__.py └── extract_output_json.py └── utils ├── __init__.py ├── bubble_graph.py ├── eval_overall.py ├── eval_sample.py ├── fp_plot ├── __init__.py └── procthorpy │ ├── __init__.py │ ├── plot.py │ └── utils.py ├── json_check ├── __init__.py ├── schema.py └── verify.py ├── json_repair.py ├── plot.py ├── polygon_object.py ├── process_dataset.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DStruct2Design: Data and Benchmarks for Data Structure Driven Generative Floor Plan Design 2 | 3 | ## Paper 4 | Our paper is available [here](https://arxiv.org/abs/2407.15723) 5 | 6 | ### if you use this repository, please cite our work: 7 | ``` 8 | @misc{luo2024dstruct2designdatabenchmarksdata, 9 | title={DStruct2Design: Data and Benchmarks for Data Structure Driven Generative Floor Plan Design}, 10 | author={Zhi Hao Luo and Luis Lara and Ge Ya Luo and Florian Golemo and Christopher Beckham and Christopher Pal}, 11 | year={2024}, 12 | eprint={2407.15723}, 13 | archivePrefix={arXiv}, 14 | primaryClass={cs.CL}, 15 | url={https://arxiv.org/abs/2407.15723}, 16 | } 17 | ``` 18 | 19 | ## Getting Started 20 | 21 | These instructions will get you a copy of the project up and running on your local 
machine for development and testing purposes.
22 |
23 | ### Prerequisites
24 |
25 | In our paper, we fine-tune a Llama-3-8B-Instruct model. Training is enabled by [llama-recipes](https://github.com/meta-llama/llama-recipes/tree/main). You can either install llama-recipes, or install the dependencies from `requirements.txt`.
26 |
27 | #### Install with llama-recipes:
28 | ```
29 | pip install llama-recipes
30 | ```
31 |
32 | #### Install from requirements.txt:
33 | ```
34 | pip install -r requirements.txt
35 | ```
36 |
37 | ## Datasets
38 |
39 | ### ProcTHOR
40 |
41 | You can download the converted ProcTHOR-10K dataset from [here](https://huggingface.co/datasets/ludolara/DStruct2Design) and put it under `datasets/procthor_converted/`.
42 |
43 | ### RPLAN
44 |
45 | The RPLAN dataset needs to be requested from its [homepage](http://staff.ustc.edu.cn/~fuxm/projects/DeepLayout/).
46 |
47 | Once obtained, save all the data (PNGs) under `datasets/rplan/`, then run our conversion script. The converted dataset will be saved under `datasets/rplan_converted/`:
48 | ```
49 | python rplan_dataset_convert.py
50 | ```
51 |
52 | ## Pretrained Weights
53 |
54 | The pretrained PEFT LoRA weights for all of our models can be obtained from the links below:
55 |
56 | #### Weights for 4 model variants trained on RPLAN
57 | ```
58 | https://drive.google.com/file/d/1cAYlEupNUGJefNdwkNaaq7fD3X3_P46D/view?usp=sharing
59 | ```
60 |
61 | #### Weights for 3 bubble diagram model variants trained on ProcTHOR
62 | ```
63 | https://drive.google.com/file/d/16cYPK6g_Ho4VbvjvBZIGHMzNTBWzcAZT/view?usp=drive_link
64 | ```
65 |
66 |
67 | #### Weights for 3 constraint-only (no bubble diagram) model variants trained on ProcTHOR
68 | ```
69 | https://drive.google.com/file/d/13k-pBmhGhYthm4WbHzrRH7WjaSKNkTpq/view?usp=drive_link
70 | ```
71 |
72 | After downloading, un-compress the archives and put them under their respective folders under `models/`.
73 |
74 | ## Training
75 |
76 | Alternatively, these weights can be trained from scratch with the following commands:
77 |
78 | #### To train on ProcTHOR:
79 |
80 | ```
81 | python -m llama_recipes.finetuning --use_peft --peft_method lora --quantization --model_name meta-llama/Meta-Llama-3-8B-Instruct --batch_size_training 2 --num_epochs 8 --dataset "custom_dataset" --custom_dataset.file "procthor_dataset.py" --use_wandb False --wandb_config.project "floorplans" --output_dir procthor --exprm $EXPRM_VAR --ds_version $BD_VAR --load_peft False
82 | ```
83 |
84 | Here, `$BD_VAR` and `$EXPRM_VAR` select the model variant to be trained, as explained in Section 6.1 of our paper.
85 |
86 | `$BD_VAR` can be set to either `'bd'` or `'non_bd'`.
87 |
88 | `$EXPRM_VAR` can be set to `'specific'`, `'mask'`, or `'preset_mask'`.
89 |
90 | #### To train on RPLAN:
91 |
92 | ```
93 | python -m llama_recipes.finetuning --use_peft --peft_method lora --quantization --model_name meta-llama/Meta-Llama-3-8B-Instruct --batch_size_training 2 --num_epochs 5 --dataset "custom_dataset" --custom_dataset.file "rplan_dataset.py" --use_wandb False --wandb_config.project "floorplans" --output_dir rplan --exprm $EXPRM_VAR --load_peft False
94 | ```
95 |
96 | For RPLAN, the model variant is determined by `$EXPRM_VAR` alone.
97 |
98 | `$EXPRM_VAR` can be one of `'5R'`, `'6R'`, `'7R'`, or `'8R'`. The differences between these variants are explained in Section 6.1 of our paper.
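For example, a minimal shell sketch for launching one ProcTHOR variant (the values below are just one of the combinations listed above; the RPLAN command works the same way, with only `$EXPRM_VAR` set):

```
# Bubble-diagram variant trained with masked constraints (illustrative values)
export EXPRM_VAR='mask'
export BD_VAR='bd'

python -m llama_recipes.finetuning --use_peft --peft_method lora --quantization \
    --model_name meta-llama/Meta-Llama-3-8B-Instruct --batch_size_training 2 --num_epochs 8 \
    --dataset "custom_dataset" --custom_dataset.file "procthor_dataset.py" \
    --use_wandb False --wandb_config.project "floorplans" \
    --output_dir procthor --exprm $EXPRM_VAR --ds_version $BD_VAR --load_peft False
```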
99 |
100 | ## Inference
101 |
102 | To run generation after the pretrained weights are obtained, do the following:
103 |
104 | (Note that you can run greedy or sampled generation. In our experiments, we use both, and sampling is done with `num_samples` set to 20.)
105 |
106 | #### To run generation on ProcTHOR-trained models:
107 |
108 | ```
109 | python run_generation_procthor.py --exprm $EXPRM_VAR --num_samples 1 --version $BD_VAR
110 | ```
111 |
112 | `$BD_VAR` can be set to either `'bd'` or `'non_bd'`.
113 |
114 | `$EXPRM_VAR` can be set to `'specific'`, `'mask'`, or `'preset_mask'`.
115 |
116 | The script will load the trained model variant according to these variables.
117 |
118 |
119 | #### To run generation on RPLAN-trained models:
120 |
121 | ```
122 | python run_generation_rplan.py --exprm $EXPRM_VAR --num_samples 1
123 | ```
124 |
125 | `$EXPRM_VAR` can be one of `'5R'`, `'6R'`, `'7R'`, or `'8R'`.
126 |
127 | The script will load the trained model variant according to this variable.
128 |
129 | ## Evaluation
130 |
131 | To evaluate generated results saved in `$RESULTS_DIR`, run the following command:
132 |
133 | ```
134 | python run_metric.py $RESULTS_DIR
135 | ```
136 |
137 |
-------------------------------------------------------------------------------- /datasets/procthor_converted/README.md: --------------------------------------------------------------------------------
1 | The converted ProcTHOR dataset goes here.
-------------------------------------------------------------------------------- /datasets/rplan_converted/README.md: --------------------------------------------------------------------------------
1 | The converted RPLAN dataset goes here.
-------------------------------------------------------------------------------- /generations/README.md: --------------------------------------------------------------------------------
1 | ## Usage
2 |
3 | Generated samples should be stored in this folder in order to be evaluated.
-------------------------------------------------------------------------------- /models/README.md: --------------------------------------------------------------------------------
1 | ## Usage
2 |
3 | Store LoRA weights here.
4 |
5 | Our pretrained weights go here as well.
6 | 7 | for our pretrained weights, structure should be the following for easy save/load access and inference: 8 | 9 | models/ 10 | | 11 | |---procthor_weights_BD_variants/ 12 | |---full_prompt/ 13 | |---mask/ 14 | |---preset_mask/ 15 | | 16 | |---procthor_weights_nonBD_variants/ 17 | |---full_prompt/ 18 | |---mask/ 19 | |---preset_mask/ 20 | | 21 | |---rplan/ 22 | |---5R/ 23 | |---6R/ 24 | |---7R/ 25 | |---8R/ 26 | -------------------------------------------------------------------------------- /recipes/README.md: -------------------------------------------------------------------------------- 1 | This folder contains examples organized by topic: 2 | 3 | | Subfolder | Description | 4 | |---|---| 5 | [quickstart](./quickstart)|The "Hello World" of using Llama2, start here if you are new to using Llama2 6 | [multilingual](./multilingual)|Scripts to add a new language to Llama2 7 | [finetuning](./finetuning)|Scripts to finetune Llama2 on single-GPU and multi-GPU setups 8 | [inference](./inference)|Scripts to deploy Llama2 for inference locally and using model servers 9 | [use_cases](./use_cases)|Scripts showing common applications of Llama2 10 | [responsible_ai](./responsible_ai)|Scripts to use PurpleLlama for safeguarding model outputs 11 | [llama_api_providers](./llama_api_providers)|Scripts to run inference on Llama via hosted endpoints 12 | [benchmarks](./benchmarks)|Scripts to benchmark Llama 2 models inference on various backends 13 | [code_llama](./code_llama)|Scripts to run inference with the Code Llama models 14 | [evaluation](./evaluation)|Scripts to evaluate fine-tuned Llama2 models using `lm-evaluation-harness` from `EleutherAI` 15 | 16 | 17 | **Note on using Replicate** 18 | To run some of the demo apps here, you'll need to first sign in with Replicate with your github account, then create a free API token [here](https://replicate.com/account/api-tokens) that you can use for a while. After the free trial ends, you'll need to enter billing info to continue to use Llama2 hosted on Replicate - according to Replicate's [Run time and cost](https://replicate.com/meta/llama-2-13b-chat) for the Llama2-13b-chat model used in our demo apps, the model "costs $0.000725 per second. Predictions typically complete within 10 seconds." This means each call to the Llama2-13b-chat model costs less than $0.01 if the call completes within 10 seconds. If you want absolutely no costs, you can refer to the section "Running Llama2 locally on Mac" above or the "Running Llama2 in Google Colab" below. 19 | 20 | **Note on using OctoAI** 21 | You can also use [OctoAI](https://octo.ai/) to run some of the Llama demos under [OctoAI_API_examples](./llama_api_providers/OctoAI_API_examples/). You can sign into OctoAI with your Google or GitHub account, which will give you $10 of free credits you can use for a month. Llama2 on OctoAI is priced at [$0.00086 per 1k tokens](https://octo.ai/pricing/) (a ~350-word LLM response), so $10 of free credits should go a very long way (about 10,000 LLM inferences). 22 | 23 | ### [Running Llama2 in Google Colab](https://colab.research.google.com/drive/1-uBXt4L-6HNS2D8Iny2DwUpVS4Ub7jnk?usp=sharing) 24 | To run Llama2 in Google Colab using [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), download the quantized Llama2-7b-chat model [here](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_0.gguf), or follow the instructions above to build it, before uploading it to your Google drive. 
Note that on the free Colab T4 GPU, the call to Llama could take more than 20 minutes to return; running the notebook locally on M1 MBP takes about 20 seconds. 25 | -------------------------------------------------------------------------------- /recipes/benchmarks/fmbench/img/CFT.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plstory/DS2D/d3f734bd465d6239100b50bb11b3903741a2be08/recipes/benchmarks/fmbench/img/CFT.png -------------------------------------------------------------------------------- /recipes/benchmarks/fmbench/img/instances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plstory/DS2D/d3f734bd465d6239100b50bb11b3903741a2be08/recipes/benchmarks/fmbench/img/instances.png -------------------------------------------------------------------------------- /recipes/benchmarks/fmbench/img/latency_vs_tokens.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plstory/DS2D/d3f734bd465d6239100b50bb11b3903741a2be08/recipes/benchmarks/fmbench/img/latency_vs_tokens.png -------------------------------------------------------------------------------- /recipes/benchmarks/inference_throughput/README.md: -------------------------------------------------------------------------------- 1 | # Inference Throughput Benchmarks 2 | In this folder we provide a series of benchmark scripts that apply a throughput analysis for Llama 2 models inference on various backends: 3 | * On-prem - Popular serving frameworks and containers (i.e. vLLM) 4 | * [**WIP**]Cloud API - Popular API services (i.e. Azure Model-as-a-Service) 5 | * [**WIP**]On-device - Popular on-device inference solutions on Android and iOS (i.e. mlc-llm, QNN) 6 | * [**WIP**]Optimization - Popular optimization solutions for faster inference and quantization (i.e. 
AutoAWQ) 7 | 8 | # Why 9 | There are three major reasons we want to run these benchmarks and share them with our Llama community: 10 | * Provide inference throughput analysis based on real world situation to help you select the best service or deployment for your scenario 11 | * Provide a baseline measurement for validating various optimization solutions on different backends, so we can provide guidance on which solutions work best for your scenario 12 | * Encourage the community to develop benchmarks on top of our works, so we can better quantify the latest proposed solutions combined with current popular frameworks, especially in this crazy fast-moving area 13 | 14 | # Parameters 15 | Here are the parameters (if applicable) that you can configure for running the benchmark: 16 | * **PROMPT** - Prompt sent in for inference (configure the length of prompt, choose from 5, 25, 50, 100, 500, 1k and 2k) 17 | * **MAX_NEW_TOKENS** - Max number of tokens generated 18 | * **CONCURRENT_LEVELS** - Max number of concurrent requests 19 | * **MODEL_PATH** - Model source 20 | * **MODEL_HEADERS** - Request headers 21 | * **SAFE_CHECK** - Content safety check (either Azure service or simulated latency) 22 | * **THRESHOLD_TPS** - Threshold TPS (threshold for tokens per second below which we deem the query to be slow) 23 | * **TOKENIZER_PATH** - Tokenizer source 24 | * **RANDOM_PROMPT_LENGTH** - Random prompt length (for pretrained models) 25 | * **NUM_GPU** - Number of GPUs for request dispatch among multiple containers 26 | * **TEMPERATURE** - Temperature for inference 27 | * **TOP_P** - Top_p for inference 28 | * **MODEL_ENDPOINTS** - Container endpoints 29 | * Model parallelism or model replicas - Load one model into multiple GPUs or multiple model replicas on one instance. More detail in the README files for specific containers. 30 | 31 | You can also configure other model hyperparameters as part of the request payload. 32 | All these parameters are stored in ```parameter.json``` and real prompts are stored in ```input.jsonl```. Running the script will load these configurations. 33 | 34 | 35 | 36 | # Metrics 37 | The benchmark will report these metrics per instance: 38 | * Number of concurrent requests 39 | * P50 Latency(ms) 40 | * P99 Latency(ms) 41 | * Request per second (RPS) 42 | * Output tokens per second 43 | * Output tokens per second per GPU 44 | * Input tokens per second 45 | * Input tokens per second per GPU 46 | * Average tokens per second per request 47 | 48 | We intend to add these metrics in the future: 49 | * Time to first token (TTFT) 50 | 51 | The benchmark result will be displayed in the terminal output and saved as a CSV file (```performance_metrics.csv```) which you can export to spreadsheets. 52 | 53 | # Getting Started 54 | Please follow the ```README.md``` in each subfolder for instructions on how to setup and run these benchmarks. 55 | 56 | -------------------------------------------------------------------------------- /recipes/benchmarks/inference_throughput/cloud-api/README.md: -------------------------------------------------------------------------------- 1 | # Llama-Cloud-API-Benchmark 2 | This folder contains code to run inference benchmark for Llama 2 models on cloud API with popular cloud service providers. The benchmark will focus on overall inference **throughput** for querying the API endpoint for output generation with different level of concurrent requests. 
Remember that to send queries to the API endpoint, you are required to acquire subscriptions with the cloud service providers and there will be a fee associated with it. 3 | 4 | Disclaimer - The purpose of the code is to provide a configurable setup to measure inference throughput. It is not a representative of the performance of these API services and we do not plan to make comparisons between different API providers. 5 | 6 | 7 | # Azure - Getting Started 8 | To get started, there are certain steps we need to take to deploy the models: 9 | 10 | 11 | * Register for a valid Azure account with subscription [here](https://azure.microsoft.com/en-us/free/search/?ef_id=_k_CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE_k_&OCID=AIDcmm5edswduu_SEM__k_CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE_k_&gad_source=1&gclid=CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE) 12 | 13 | * Take a quick look on what is the [Azure AI Studio](https://learn.microsoft.com/en-us/azure/ai-studio/what-is-ai-studio?tabs=home) and navigate to the website from the link in the article 14 | * Follow the demos in the article to create a project and [resource](https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/manage-resource-groups-portal) group, or you can also follow the guide [here](https://learn.microsoft.com/en-us/azure/ai-studio/how-to/deploy-models-llama?tabs=azure-studio) 15 | * Select Llama models from Model catalog 16 | * Deploy with "Pay-as-you-go" 17 | 18 | Once deployed successfully, you should be assigned for an API endpoint and a security key for inference. 19 | For more information, you should consult Azure's official documentation [here](https://learn.microsoft.com/en-us/azure/ai-studio/how-to/deploy-models-llama?tabs=azure-studio) for model deployment and inference. 20 | 21 | Now, replace the endpoint url and API key in ```azure/parameters.json```. For parameter `MODEL_ENDPOINTS`, with chat models the suffix should be `v1/chat/completions` and with pretrained models the suffix should be `v1/completions`. 22 | Note that the API endpoint might implemented a rate limit for token generation in certain amount of time. If you encountered the error, you can try reduce `MAX_NEW_TOKEN` or start with smaller `CONCURRENT_LEVELs`. 23 | 24 | Once everything configured, to run chat model benchmark: 25 | ```python chat_azure_api_benchmark.py``` 26 | 27 | To run pretrained model benchmark: 28 | ```python pretrained_azure_api_benchmark.py``` 29 | 30 | Once finished, the result will be written into a CSV file in the same directory, which can be later imported into dashboard of your choice. 31 | -------------------------------------------------------------------------------- /recipes/benchmarks/inference_throughput/cloud-api/azure/parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "MAX_NEW_TOKEN" : 256, 3 | "CONCURRENT_LEVELS" : [1, 2, 4, 8, 16, 32, 64], 4 | "THRESHOLD_TPS" : 7, 5 | "TOKENIZER_PATH" : "../../tokenizer", 6 | "RANDOM_PROMPT_LENGTH" : 1000, 7 | "TEMPERATURE" : 0.6, 8 | "TOP_P" : 0.9, 9 | "MODEL_ENDPOINTS" : "https://your-endpoint.inference.ai.azure.com/v1/completions", 10 | "API_KEY" : "your-auth-key", 11 | "SYS_PROMPT" : "You are a helpful assistant." 
12 | } -------------------------------------------------------------------------------- /recipes/benchmarks/inference_throughput/on-prem/README.md: -------------------------------------------------------------------------------- 1 | # Llama-On-Prem-Benchmark 2 | This folder contains code to run inference benchmark for Llama 2 models on-prem with popular serving frameworks. 3 | The benchmark will focus on overall inference **throughput** for running containers on one instance (single or multiple GPUs) that you can acquire from cloud service providers such as Azure and AWS. You can also run this benchmark on local laptop or desktop. 4 | We support benchmark on these serving framework: 5 | * [vLLM](https://github.com/vllm-project/vllm) 6 | 7 | 8 | # vLLM - Getting Started 9 | 10 | To get started, we first need to deploy containers on-prem as a API host. Follow the guidance [here](../../../inference/model_servers/llama-on-prem.md#setting-up-vllm-with-llama-2) to deploy vLLM on-prem. 11 | 12 | Note that in common scenario which overall throughput is important, we suggest you prioritize deploying as many model replicas as possible to reach higher overall throughput and request-per-second (RPS), comparing to deploy one model container among multiple GPUs for model parallelism. Additionally, as deploying multiple model replicas, there is a need for a higher level wrapper to handle the load balancing which here has been simulated in the benchmark scripts. 13 | For example, we have an instance from Azure that has 8xA100 80G GPUs, and we want to deploy the Llama 2 70B chat model, which is around 140GB with FP16. So for deployment we can do: 14 | * 1x70B model parallel on 8 GPUs, each GPU RAM takes around 17.5GB for loading model weights. 15 | * 2x70B models each use 4 GPUs, each GPU RAM takes around 35GB for loading model weights. 16 | * 4x70B models each use 2 GPUs, each GPU RAM takes around 70GB for loading model weights. (Preferred configuration for max overall throughput. Note that you will have 4 endpoints hosted on different ports and the benchmark script will route requests into each model equally) 17 | 18 | Here are examples for deploying 2x70B chat models over 8 GPUs with vLLM. 19 | ``` 20 | CUDA_VISIBLE_DEVICES=0,1,2,3 python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-70b-chat-hf --tensor-parallel-size 4 --disable-log-requests --port 8000 21 | CUDA_VISIBLE_DEVICES=4,5,6,7 python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-70b-chat-hf --tensor-parallel-size 4 --disable-log-requests --port 8001 22 | ``` 23 | Once you have finished deployment, you can use the command below to run benchmark scripts in a separate terminal. 24 | 25 | ``` 26 | python chat_vllm_benchmark.py 27 | ``` 28 | 29 | If you are going to use [Azure AI content check](https://azure.microsoft.com/en-us/products/ai-services/ai-content-safety), then you should install dependencies as shown below in your terminal: 30 | 31 | ``` 32 | pip install azure-ai-contentsafety azure-core 33 | ``` 34 | Besides chat models, we also provide benchmark scripts for running pretrained models for text completion tasks. To better simulate the real traffic, we generate configurable random token prompt as input. In this process, we select vocabulary that is longer than 2 tokens so the generated words are closer to the English, rather than symbols. 35 | However, random token prompts can't be applied for chat model benchmarks, since the chat model expects a valid question. 
By feeding random prompts, chat models rarely provide answers that is meeting our ```MAX_NEW_TOKEN``` requirement, defeating the purpose of running throughput benchmarks. Hence for chat models, the questions are copied over to form long inputs such as for 2k and 4k inputs. 36 | To run pretrained model benchmark, follow the command below. 37 | ``` 38 | python pretrained_vllm_benchmark.py 39 | ``` 40 | 41 | -------------------------------------------------------------------------------- /recipes/benchmarks/inference_throughput/on-prem/vllm/parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "MAX_NEW_TOKENS" : 256, 3 | "CONCURRENT_LEVELS" : [1, 2, 4, 8, 16, 32, 64, 128, 256], 4 | "MODEL_PATH" : "meta-llama/Llama-2-7b-chat-hf", 5 | "MODEL_HEADERS" : {"Content-Type": "application/json"}, 6 | "SAFE_CHECK" : true, 7 | "THRESHOLD_TPS" : 7, 8 | "TOKENIZER_PATH" : "../../tokenizer", 9 | "RANDOM_PROMPT_LENGTH" : 1000, 10 | "TEMPERATURE" : 0.6, 11 | "TOP_P" : 0.9, 12 | "MODEL_ENDPOINTS" : [ 13 | "http://localhost:8000/v1/chat/completions" 14 | ] 15 | } -------------------------------------------------------------------------------- /recipes/benchmarks/inference_throughput/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | requests 3 | azure-core 4 | azure-ai-contentsafety 5 | torch 6 | -------------------------------------------------------------------------------- /recipes/benchmarks/inference_throughput/tokenizer/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "bos_token": { 3 | "content": "", 4 | "lstrip": false, 5 | "normalized": true, 6 | "rstrip": false, 7 | "single_word": false 8 | }, 9 | "eos_token": { 10 | "content": "", 11 | "lstrip": false, 12 | "normalized": true, 13 | "rstrip": false, 14 | "single_word": false 15 | }, 16 | "unk_token": { 17 | "content": "", 18 | "lstrip": false, 19 | "normalized": true, 20 | "rstrip": false, 21 | "single_word": false 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /recipes/benchmarks/inference_throughput/tokenizer/tokenizer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plstory/DS2D/d3f734bd465d6239100b50bb11b3903741a2be08/recipes/benchmarks/inference_throughput/tokenizer/tokenizer.model -------------------------------------------------------------------------------- /recipes/benchmarks/inference_throughput/tokenizer/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "add_bos_token": true, 3 | "add_eos_token": false, 4 | "bos_token": { 5 | "__type": "AddedToken", 6 | "content": "", 7 | "lstrip": false, 8 | "normalized": true, 9 | "rstrip": false, 10 | "single_word": false 11 | }, 12 | "clean_up_tokenization_spaces": false, 13 | "eos_token": { 14 | "__type": "AddedToken", 15 | "content": "", 16 | "lstrip": false, 17 | "normalized": true, 18 | "rstrip": false, 19 | "single_word": false 20 | }, 21 | "legacy": true, 22 | "use_default_system_prompt": false, 23 | "model_max_length": 1000000000000000019884624838656, 24 | "pad_token": null, 25 | "sp_model_kwargs": {}, 26 | "tokenizer_class": "LlamaTokenizerFast", 27 | "unk_token": { 28 | "__type": "AddedToken", 29 | "content": "", 30 | "lstrip": false, 31 | "normalized": true, 32 | "rstrip": false, 33 | "single_word": false 34 | } 35 | } 
36 | -------------------------------------------------------------------------------- /recipes/code_llama/README.md: -------------------------------------------------------------------------------- 1 | # Code Llama 2 | 3 | Code llama was recently released with three flavors, base-model that support multiple programming languages, Python fine-tuned model and an instruction fine-tuned and aligned variation of Code Llama, please read more [here](https://ai.meta.com/blog/code-llama-large-language-model-coding/). Also note that the Python fine-tuned model and 34B models are not trained on infilling objective, hence can not be used for infilling use-case. 4 | 5 | Find the scripts to run Code Llama, where there are two examples of running code completion and infilling. 6 | 7 | **Note** Please find the right model on HF side [here](https://huggingface.co/codellama). 8 | 9 | Make sure to install Transformers from source for now 10 | 11 | ```bash 12 | 13 | pip install git+https://github.com/huggingface/transformers 14 | 15 | ``` 16 | 17 | To run the code completion example: 18 | 19 | ```bash 20 | 21 | python code_completion_example.py --model_name MODEL_NAME --prompt_file code_completion_prompt.txt --temperature 0.2 --top_p 0.9 22 | 23 | ``` 24 | 25 | To run the code infilling example: 26 | 27 | ```bash 28 | 29 | python code_infilling_example.py --model_name MODEL_NAME --prompt_file code_infilling_prompt.txt --temperature 0.2 --top_p 0.9 30 | 31 | ``` 32 | To run the 70B Instruct model example run the following (you'll need to enter the system and user prompts to instruct the model): 33 | 34 | ```bash 35 | 36 | python code_instruct_example.py --model_name codellama/CodeLlama-70b-Instruct-hf --temperature 0.2 --top_p 0.9 37 | 38 | ``` 39 | You can learn more about the chat prompt template [on HF](https://huggingface.co/codellama/CodeLlama-70b-Instruct-hf#chat-prompt) and [original Code Llama repository](https://github.com/facebookresearch/codellama/blob/main/README.md#fine-tuned-instruction-models). HF tokenizer has already taken care of the chat template as shown in this example. 40 | -------------------------------------------------------------------------------- /recipes/code_llama/code_completion_example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | # from accelerate import init_empty_weights, load_checkpoint_and_dispatch 5 | 6 | import fire 7 | import os 8 | import sys 9 | import time 10 | 11 | import torch 12 | from transformers import AutoTokenizer 13 | 14 | from llama_recipes.inference.safety_utils import get_safety_checker 15 | from llama_recipes.inference.model_utils import load_model, load_peft_model 16 | 17 | 18 | def main( 19 | model_name, 20 | peft_model: str=None, 21 | quantization: bool=False, 22 | max_new_tokens =100, #The maximum numbers of tokens to generate 23 | prompt_file: str=None, 24 | seed: int=42, #seed value for reproducibility 25 | do_sample: bool=True, #Whether or not to use sampling ; use greedy decoding otherwise. 26 | min_length: int=None, #The minimum length of the sequence to be generated, input prompt + min_new_tokens 27 | use_cache: bool=True, #[optional] Whether or not the model should use the past last key/values attentions Whether or not the model should use the past last key/values attentions (if applicable to the model) to speed up decoding. 
28 | top_p: float=0.9, # [optional] If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. 29 | temperature: float=0.6, # [optional] The value used to modulate the next token probabilities. 30 | top_k: int=50, # [optional] The number of highest probability vocabulary tokens to keep for top-k-filtering. 31 | repetition_penalty: float=1.0, #The parameter for repetition penalty. 1.0 means no penalty. 32 | length_penalty: int=1, #[optional] Exponential penalty to the length that is used with beam-based generation. 33 | enable_azure_content_safety: bool=False, # Enable safety check with Azure content safety api 34 | enable_sensitive_topics: bool=False, # Enable check for sensitive topics using AuditNLG APIs 35 | enable_salesforce_content_safety: bool=True, # Enable safety check with Salesforce safety flan t5 36 | enable_llamaguard_content_safety: bool=False, # Enable safety check with Llama-Guard 37 | use_fast_kernels: bool = True, # Enable using SDPA from PyTroch Accelerated Transformers, make use Flash Attention and Xformer memory-efficient kernels 38 | **kwargs 39 | ): 40 | if prompt_file is not None: 41 | assert os.path.exists( 42 | prompt_file 43 | ), f"Provided Prompt file does not exist {prompt_file}" 44 | with open(prompt_file, "r") as f: 45 | user_prompt = f.read() 46 | else: 47 | print("No user prompt provided. Exiting.") 48 | sys.exit(1) 49 | 50 | # Set the seeds for reproducibility 51 | torch.cuda.manual_seed(seed) 52 | torch.manual_seed(seed) 53 | 54 | model = load_model(model_name, quantization, use_fast_kernels) 55 | if peft_model: 56 | model = load_peft_model(model, peft_model) 57 | 58 | model.eval() 59 | 60 | tokenizer = AutoTokenizer.from_pretrained(model_name) 61 | safety_checker = get_safety_checker(enable_azure_content_safety, 62 | enable_sensitive_topics, 63 | enable_salesforce_content_safety, 64 | enable_llamaguard_content_safety, 65 | ) 66 | 67 | # Safety check of the user prompt 68 | safety_results = [check(user_prompt) for check in safety_checker] 69 | are_safe = all([r[1] for r in safety_results]) 70 | if are_safe: 71 | print("User prompt deemed safe.") 72 | print(f"User prompt:\n{user_prompt}") 73 | else: 74 | print("User prompt deemed unsafe.") 75 | for method, is_safe, report in safety_results: 76 | if not is_safe: 77 | print(method) 78 | print(report) 79 | print("Skipping the inference as the prompt is not safe.") 80 | sys.exit(1) # Exit the program with an error status 81 | 82 | batch = tokenizer(user_prompt, return_tensors="pt") 83 | 84 | batch = {k: v.to("cuda") for k, v in batch.items()} 85 | start = time.perf_counter() 86 | with torch.no_grad(): 87 | outputs = model.generate( 88 | **batch, 89 | max_new_tokens=max_new_tokens, 90 | do_sample=do_sample, 91 | top_p=top_p, 92 | temperature=temperature, 93 | min_length=min_length, 94 | use_cache=use_cache, 95 | top_k=top_k, 96 | repetition_penalty=repetition_penalty, 97 | length_penalty=length_penalty, 98 | **kwargs 99 | ) 100 | e2e_inference_time = (time.perf_counter()-start)*1000 101 | print(f"the inference time is {e2e_inference_time} ms") 102 | output_text = tokenizer.decode(outputs[0], skip_special_tokens=True) 103 | 104 | # Safety check of the model output 105 | safety_results = [check(output_text) for check in safety_checker] 106 | are_safe = all([r[1] for r in safety_results]) 107 | if are_safe: 108 | print("User input and model output deemed safe.") 109 | print(f"Model output:\n{output_text}") 110 | else: 111 | 
print("Model output deemed unsafe.") 112 | for method, is_safe, report in safety_results: 113 | if not is_safe: 114 | print(method) 115 | print(report) 116 | 117 | 118 | if __name__ == "__main__": 119 | fire.Fire(main) 120 | -------------------------------------------------------------------------------- /recipes/code_llama/code_completion_prompt.txt: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def main(string: str): 4 | print(string) 5 | print(string[::-1]) 6 | 7 | if __name__ == "__main__": -------------------------------------------------------------------------------- /recipes/code_llama/code_infilling_example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | # from accelerate import init_empty_weights, load_checkpoint_and_dispatch 5 | 6 | import fire 7 | import torch 8 | import os 9 | import sys 10 | import time 11 | 12 | from transformers import AutoTokenizer 13 | 14 | from llama_recipes.inference.safety_utils import get_safety_checker 15 | from llama_recipes.inference.model_utils import load_model, load_peft_model 16 | 17 | def main( 18 | model_name, 19 | peft_model: str=None, 20 | quantization: bool=False, 21 | max_new_tokens =100, #The maximum numbers of tokens to generate 22 | prompt_file: str=None, 23 | seed: int=42, #seed value for reproducibility 24 | do_sample: bool=True, #Whether or not to use sampling ; use greedy decoding otherwise. 25 | min_length: int=None, #The minimum length of the sequence to be generated, input prompt + min_new_tokens 26 | use_cache: bool=True, #[optional] Whether or not the model should use the past last key/values attentions Whether or not the model should use the past last key/values attentions (if applicable to the model) to speed up decoding. 27 | top_p: float=0.9, # [optional] If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. 28 | temperature: float=0.6, # [optional] The value used to modulate the next token probabilities. 29 | top_k: int=50, # [optional] The number of highest probability vocabulary tokens to keep for top-k-filtering. 30 | repetition_penalty: float=1.0, #The parameter for repetition penalty. 1.0 means no penalty. 31 | length_penalty: int=1, #[optional] Exponential penalty to the length that is used with beam-based generation. 32 | enable_azure_content_safety: bool=False, # Enable safety check with Azure content safety api 33 | enable_sensitive_topics: bool=False, # Enable check for sensitive topics using AuditNLG APIs 34 | enable_salesforce_content_safety: bool=True, # Enable safety check with Salesforce safety flan t5 35 | enable_llamaguard_content_safety: bool=False, # Enable safety check with Llama-Guard 36 | use_fast_kernels: bool = True, # Enable using SDPA from PyTroch Accelerated Transformers, make use Flash Attention and Xformer memory-efficient kernels 37 | **kwargs 38 | ): 39 | if prompt_file is not None: 40 | assert os.path.exists( 41 | prompt_file 42 | ), f"Provided Prompt file does not exist {prompt_file}" 43 | with open(prompt_file, "r") as f: 44 | user_prompt = f.read() 45 | else: 46 | print("No user prompt provided. 
Exiting.") 47 | sys.exit(1) 48 | # Set the seeds for reproducibility 49 | torch.cuda.manual_seed(seed) 50 | torch.manual_seed(seed) 51 | 52 | model = load_model(model_name, quantization, use_fast_kernels) 53 | model.config.tp_size=1 54 | if peft_model: 55 | model = load_peft_model(model, peft_model) 56 | 57 | model.eval() 58 | 59 | tokenizer = AutoTokenizer.from_pretrained(model_name) 60 | 61 | safety_checker = get_safety_checker(enable_azure_content_safety, 62 | enable_sensitive_topics, 63 | enable_salesforce_content_safety, 64 | enable_llamaguard_content_safety, 65 | ) 66 | 67 | # Safety check of the user prompt 68 | safety_results = [check(user_prompt) for check in safety_checker] 69 | are_safe = all([r[1] for r in safety_results]) 70 | if are_safe: 71 | print("User prompt deemed safe.") 72 | print(f"User prompt:\n{user_prompt}") 73 | else: 74 | print("User prompt deemed unsafe.") 75 | for method, is_safe, report in safety_results: 76 | if not is_safe: 77 | print(method) 78 | print(report) 79 | print("Skipping the inference as the prompt is not safe.") 80 | sys.exit(1) # Exit the program with an error status 81 | 82 | batch = tokenizer(user_prompt, return_tensors="pt") 83 | batch = {k: v.to("cuda") for k, v in batch.items()} 84 | 85 | start = time.perf_counter() 86 | with torch.no_grad(): 87 | outputs = model.generate( 88 | **batch, 89 | max_new_tokens=max_new_tokens, 90 | do_sample=do_sample, 91 | top_p=top_p, 92 | temperature=temperature, 93 | min_length=min_length, 94 | use_cache=use_cache, 95 | top_k=top_k, 96 | repetition_penalty=repetition_penalty, 97 | length_penalty=length_penalty, 98 | **kwargs 99 | ) 100 | e2e_inference_time = (time.perf_counter()-start)*1000 101 | print(f"the inference time is {e2e_inference_time} ms") 102 | filling = tokenizer.batch_decode(outputs[:, batch["input_ids"].shape[1]:], skip_special_tokens=True)[0] 103 | # Safety check of the model output 104 | safety_results = [check(filling) for check in safety_checker] 105 | are_safe = all([r[1] for r in safety_results]) 106 | if are_safe: 107 | print("User input and model output deemed safe.") 108 | print(user_prompt.replace("", filling)) 109 | else: 110 | print("Model output deemed unsafe.") 111 | for method, is_safe, report in safety_results: 112 | if not is_safe: 113 | print(method) 114 | print(report) 115 | 116 | 117 | if __name__ == "__main__": 118 | fire.Fire(main) 119 | -------------------------------------------------------------------------------- /recipes/code_llama/code_infilling_prompt.txt: -------------------------------------------------------------------------------- 1 | def remove_non_ascii(s: str) -> str: 2 | """ 3 | return result 4 | -------------------------------------------------------------------------------- /recipes/evaluation/open_llm_eval_prep.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | #!/bin/bash 5 | 6 | # Prompt the user for the EVAL_PATH 7 | read -p "Enter the asbolute path to the lm-evaluation-harness: " EVAL_PATH 8 | conda activate 9 | # Directory containing YAML files 10 | DIR="open_llm_leaderboard" 11 | 12 | # Check if the directory exists 13 | if [ ! -d "$DIR" ]; then 14 | echo "Error: Directory '$DIR' not found." 
15 | exit 1 16 | fi 17 | 18 | # Iterate over YAML files in the directory and update them 19 | for YAML_FILE in "$DIR"/*.yaml 20 | do 21 | if [ -f "$YAML_FILE" ]; then 22 | sed -i 's|{\$EVAL_PATH}|'"$EVAL_PATH"'|g' "$YAML_FILE" 23 | echo "Updated $YAML_FILE with EVAL_PATH: $EVAL_PATH" 24 | fi 25 | done 26 | -------------------------------------------------------------------------------- /recipes/evaluation/open_llm_leaderboard/arc_challeneg_25shots.yaml: -------------------------------------------------------------------------------- 1 | include: {$EVAL_PATH}/lm_eval/tasks/arc/arc_challenge.yaml 2 | task: arc_challenge_25_shot 3 | task_alias: arc 25 shot 4 | num_fewshot: 25 5 | metric_list: 6 | - metric: acc_norm 7 | -------------------------------------------------------------------------------- /recipes/evaluation/open_llm_leaderboard/hellaswag_10shots.yaml: -------------------------------------------------------------------------------- 1 | include: {$EVAL_PATH}/lm_eval/tasks/hellaswag/hellaswag.yaml 2 | task: hellaswag_10_shot 3 | task_alias: hellaswag 10 shot 4 | num_fewshot: 10 5 | metric_list: 6 | - metric: acc_norm 7 | -------------------------------------------------------------------------------- /recipes/evaluation/open_llm_leaderboard/hellaswag_utils.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import re 3 | 4 | 5 | def preprocess(text): 6 | text = text.strip() 7 | # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag. 8 | text = text.replace(" [title]", ". ") 9 | text = re.sub("\\[.*?\\]", "", text) 10 | text = text.replace(" ", " ") 11 | return text 12 | 13 | 14 | def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: 15 | def _process_doc(doc): 16 | ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize() 17 | out_doc = { 18 | "query": preprocess(doc["activity_label"] + ": " + ctx), 19 | "choices": [preprocess(ending) for ending in doc["endings"]], 20 | "gold": int(doc["label"]), 21 | } 22 | return out_doc 23 | 24 | return dataset.map(_process_doc) 25 | -------------------------------------------------------------------------------- /recipes/evaluation/open_llm_leaderboard/mmlu_5shots.yaml: -------------------------------------------------------------------------------- 1 | include: {$EVAL_PATH}/lm_eval/tasks/mmlu/default/_mmlu.yaml 2 | task: 3 | - mmlu_stem 4 | - mmlu_other 5 | - mmlu_social_sciences 6 | - mmlu_humanities 7 | num_fewshot: 5 8 | metric_list: 9 | - metric: acc -------------------------------------------------------------------------------- /recipes/evaluation/open_llm_leaderboard/winogrande_5shots.yaml: -------------------------------------------------------------------------------- 1 | include: {$EVAL_PATH}/lm_eval/tasks/winogrande/default.yaml 2 | task: winogrande_5_shot 3 | task_alias: winogrande 5 shot 4 | num_fewshot: 5 5 | metric_list: 6 | - metric: acc 7 | -------------------------------------------------------------------------------- /recipes/finetuning/LLM_finetuning_overview.md: -------------------------------------------------------------------------------- 1 | ## LLM Fine-Tuning 2 | 3 | Here we discuss fine-tuning Meta Llama 3 with a couple of different recipes. We will cover two scenarios here: 4 | 5 | 6 | ## 1. **Parameter Efficient Model Fine-Tuning** 7 | This helps make the fine-tuning process more affordable even on 1 consumer grade GPU. 
These methods enable us to keep the whole model frozen and to add only a small number of learnable parameters/layers to it. In this way, we train only a tiny fraction of the parameters. The most well-known methods in this category are [LoRA](https://arxiv.org/pdf/2106.09685.pdf), Llama Adapter, and prefix-tuning.
8 |
9 |
10 | These methods address three aspects:
11 |
12 |
13 | - **Cost of full fine-tuning** – these methods only train a small set of extra parameters instead of the full model, which makes it possible to run them on consumer GPUs.
14 |
15 | - **Cost of deployment** – normally, each fine-tuned downstream model needs to be deployed separately; with these methods, only a small set of extra parameters (a few MB instead of several GB) needs to be stored per task on top of the pretrained model. The pretrained model acts as a shared backbone, and these extra parameters act as task-specific heads.
16 |
17 | - **Catastrophic forgetting** — these methods also help reduce the forgetting of previously learned capabilities that can happen during fine-tuning.
18 |
19 | The HF [PEFT](https://github.com/huggingface/peft) library provides an easy way of using these methods, which we make use of here. Please read more [here](https://huggingface.co/blog/peft).
20 |
21 |
22 |
23 | ## 2. **Full/Partial Parameter Fine-Tuning**
24 |
25 | Full parameter fine-tuning has its own advantages; within this approach there are multiple strategies that can help:
26 |
27 | - Keep the pretrained model frozen and only fine-tune the task head, for example a classification head.
28 |
29 |
30 | - Keep the pretrained model frozen and add a few fully connected layers on top.
31 |
32 |
33 | - Fine-tune all the layers.
34 |
35 | You can also keep most of the layers frozen and only fine-tune a few. There are many different criteria by which to choose which layers to freeze or unfreeze; a minimal example is sketched below.
36 |
37 |
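As a minimal sketch of the partial-freezing option, assuming a Hugging Face Llama-style causal LM (the checkpoint name and the number of unfrozen layers are placeholders, not a prescription):

```python
import torch
from transformers import AutoModelForCausalLM

# Load a causal LM (example checkpoint; swap in the model you are fine-tuning).
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype=torch.bfloat16
)

# Freeze every parameter first.
for param in model.parameters():
    param.requires_grad = False

# Unfreeze only the last N decoder layers and the LM head (N is arbitrary here).
N = 2
for layer in model.model.layers[-N:]:
    for param in layer.parameters():
        param.requires_grad = True
for param in model.lm_head.parameters():
    param.requires_grad = True

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {trainable:,} / {total:,}")
```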
38 | *(Three figures illustrating the fine-tuning strategies above: Image 1, Image 2, Image 3.)*
39 |
40 |
41 |
42 | 43 | 44 | 45 | In this scenario depending on the model size, you might need to go beyond one GPU, especially if your model does not fit into one GPU for training. In this case Meta Llama 3 8B parameter won't fit into one gpu. 46 | The way you want to think about it is, you would need enough GPU memory to keep model parameters, gradients and optimizer states. Where each of these, depending on the precision you are training, can take up multiple times of your parameter count x precision( depending on if its fp32/ 4 bytes, fp16/2 bytes/ bf16/2 bytes). 47 | For example AdamW optimizer keeps 2 parameters for each of your parameters and in many cases these are kept in fp32. This implies that depending on how many layers you are training/ unfreezing your GPU memory can grow beyond one GPU. 48 | 49 | **FSDP (Fully Sharded Data Parallel)** 50 | 51 | 52 | Pytorch has the FSDP package for training models that do not fit into one GPU. FSDP lets you train a much larger model with the same amount of resources. Prior to FSDP was DDP (Distributed Data Parallel) where each GPU was holding a full replica of the model and would only shard the data. At the end of backward pass it would sync up the gradients. 53 | 54 | FSDP extends this idea, not only sharding the data but also model parameters, gradients and optimizer states. This means each GPU will only keep one shard of the model. This will result in huge memory savings that enable us to fit a much larger model into the same number of GPU. As an example in DDP the most you could fit into a GPU with 16GB memory is a model around 700M parameters. So, suppose you had 4 GPUs, in this case even though you access 4 GPUs, you still can't scale beyond the model size that can fit into one GPU. However with FSDP you can fit a 3B model into 4 GPUs, > 4x larger model. 55 | 56 | 57 | Please read more on FSDP [here](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) & get started with FSDP [here](https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html). 58 | 59 | 60 | To boost the performance of fine-tuning with FSDP, we can make use a number of features such as: 61 | 62 | - **Mixed Precision** which in FSDP is much more flexible compared to Autocast. It gives user control over setting precision for model parameters, buffers and gradients. 63 | 64 | - **Activation Checkpointing** which is a technique to save memory by discarding the intermediate activation in forward pass instead of keeping it in the memory with the cost recomputing them in the backward pass. FSDP Activation checkpointing is shard aware meaning we need to apply it after wrapping the model with FSDP. In our script we are making use of that. 65 | 66 | - **auto_wrap_policy** Which is the way to specify how FSDP would partition the model, there is default support for transformer wrapping policy. This allows FSDP to form each FSDP unit ( partition of the model ) based on the transformer class in the model. To identify this layer in the model, you need to look at the layer that wraps both the attention layer and MLP. This helps FSDP have more fine-grained units for communication that help with optimizing the communication cost. 67 | -------------------------------------------------------------------------------- /recipes/finetuning/datasets/custom_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 
2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | # For dataset details visit: https://huggingface.co/datasets/samsum 5 | 6 | import copy 7 | import datasets 8 | import itertools 9 | 10 | 11 | B_INST, E_INST = "[INST]", "[/INST]" 12 | 13 | def tokenize_dialog(dialog, tokenizer): 14 | if tokenizer.vocab_size >= 128000: 15 | dialog_tokens = tokenizer.apply_chat_template(dialog) 16 | dialog_tokens = dialog_tokens[:-4] # Remove generation prompt <|start_header_id|>assistant<|end_header_id|>\n\n 17 | eot_indices = [i for i,n in enumerate(dialog_tokens) if n == 128009] 18 | labels = copy.copy(dialog_tokens) 19 | last_idx = 0 20 | for n, idx in enumerate(eot_indices): 21 | if n % 2 == 1: 22 | last_idx = idx 23 | else: 24 | labels[last_idx:idx+1] = [-100] * (idx-last_idx+1) 25 | 26 | dialog_tokens = [dialog_tokens] 27 | labels_tokens = [labels] 28 | else: 29 | prompt_tokens = [tokenizer.encode(f"{tokenizer.bos_token}{B_INST} {(prompt['content']).strip()} {E_INST}", add_special_tokens=False) for prompt in dialog[::2]] 30 | answer_tokens = [tokenizer.encode(f"{answer['content'].strip()} {tokenizer.eos_token}", add_special_tokens=False) for answer in dialog[1::2]] 31 | dialog_tokens = list(itertools.chain.from_iterable(zip(prompt_tokens, answer_tokens))) 32 | 33 | #Add labels, convert prompt token to -100 in order to ignore in loss function 34 | labels_tokens = [len(c)*[-100,] if i % 2 == 0 else c for i,c in enumerate(dialog_tokens)] 35 | 36 | combined_tokens = { 37 | "input_ids": list(itertools.chain(*(t for t in dialog_tokens))), 38 | "labels": list(itertools.chain(*(t for t in labels_tokens))), 39 | } 40 | 41 | return dict(combined_tokens, attention_mask=[1]*len(combined_tokens["input_ids"])) 42 | 43 | 44 | def get_custom_dataset(dataset_config, tokenizer, split): 45 | dataset = datasets.load_dataset("OpenAssistant/oasst1", split=split) 46 | 47 | dataset = dataset.map(lambda sample: { 48 | "message_id": sample["message_id"], 49 | "parent_id": sample["parent_id"], 50 | "text": sample["text"], 51 | }, 52 | batched=True, 53 | remove_columns=list(dataset.features),) 54 | 55 | nodes = {} 56 | 57 | messages = {} 58 | root_ids = [] 59 | 60 | for data in dataset: 61 | if data["parent_id"]: 62 | nodes[data["parent_id"]] = nodes.get(data["parent_id"], []) + [data["message_id"]] 63 | else: 64 | root_ids.append(data["message_id"]) 65 | messages[data["message_id"]]=data["text"] 66 | 67 | def follow(thread, current_id): 68 | thread = copy.copy(thread) + [messages[current_id]] 69 | if current_id in nodes: 70 | new_threads = [] 71 | for next_id in nodes[current_id]: 72 | new_threads += follow(thread, next_id) 73 | return new_threads 74 | else: 75 | return [thread] 76 | 77 | def get_threads_from_root(root_id): 78 | all_threads = [] 79 | thread = [messages[root_id]] 80 | for cid in nodes[root_id]: 81 | all_threads += follow(thread, cid) 82 | return all_threads 83 | 84 | dataset = dataset.filter(lambda x: x["message_id"] in root_ids) 85 | dataset = dataset.map(lambda x: {"thread": get_threads_from_root(x["message_id"])}, remove_columns=list(dataset.features)) 86 | dataset = dataset.map(lambda x: {"thread": [i for row in x["thread"] for i in row]}, batched=True) 87 | 88 | def to_dialog(thread): 89 | dialog = [] 90 | for i, content in enumerate(thread): 91 | dialog.append({ 92 | "role": "user" if i % 2 == 0 else "assistant", 93 | "content": content, 94 | }) 95 | return {"dialog": dialog} 96 | 97 | dataset = dataset.map(lambda x: 
to_dialog(x["thread"]), remove_columns=list(dataset.features)) 98 | dataset = dataset.map(lambda x: tokenize_dialog(x["dialog"], tokenizer), remove_columns=list(dataset.features)) 99 | 100 | return dataset 101 | -------------------------------------------------------------------------------- /recipes/finetuning/finetuning.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | import fire 5 | from llama_recipes.finetuning import main 6 | 7 | if __name__ == "__main__": 8 | fire.Fire(main) -------------------------------------------------------------------------------- /recipes/finetuning/multi_node.slurm: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the GNU General Public License version 3. 3 | 4 | 5 | #!/bin/bash 6 | 7 | #SBATCH --job-name=Nano-2d-trainer-20b-8nodes 8 | 9 | #SBATCH --ntasks=2 10 | #SBATCH --nodes=2 11 | #SBATCH --gpus-per-task=4 12 | #SBATCH --partition=train 13 | nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) ) 14 | nodes_array=($nodes) 15 | head_node=${nodes_array[0]} 16 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 17 | # Enable for A100 18 | export FI_PROVIDER="efa" 19 | 20 | echo Node IP: $head_node_ip 21 | export LOGLEVEL=INFO 22 | # debugging flags (optional) 23 | export NCCL_DEBUG=WARN 24 | export NCCL_DEBUG_SUBSYS=WARN 25 | export PYTHONFAULTHANDLER=1 26 | export LD_LIBRARY_PATH=/opt/amazon/efa/lib:$LD_LIBRARY_PATH 27 | export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH 28 | export CUDA_LAUNCH_BLOCKING=0 29 | 30 | # on your cluster you might need these: 31 | # set the network interface 32 | export NCCL_SOCKET_IFNAME="ens" 33 | export FI_EFA_USE_DEVICE_RDMA=1 34 | 35 | srun torchrun --nproc_per_node 4 --rdzv_id $RANDOM --rdzv_backend c10d --rdzv_endpoint $head_node_ip:29500 ./finetuning.py --enable_fsdp --use_peft --peft_method lora 36 | 37 | -------------------------------------------------------------------------------- /recipes/finetuning/singlegpu_finetuning.md: -------------------------------------------------------------------------------- 1 | # Fine-tuning with Single GPU 2 | This recipe steps you through how to finetune a Meta Llama 3 model on the text summarization task using the [samsum](https://huggingface.co/datasets/samsum) dataset on a single GPU. 3 | 4 | These are the instructions for using the canonical [finetuning script](../../src/llama_recipes/finetuning.py) in the llama-recipes package. 5 | 6 | 7 | ## Requirements 8 | 9 | Ensure that you have installed the llama-recipes package ([details](../../README.md#installing)). 10 | 11 | To run fine-tuning on a single GPU, we will make use of two packages: 12 | 1. [PEFT](https://github.com/huggingface/peft) to use parameter-efficient finetuning. 13 | 2. [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) for int8 quantization. 14 | 15 | 16 | ## How to run it? 
17 | 18 | ```bash 19 | python finetuning.py --use_peft --peft_method lora --quantization --use_fp16 --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model 20 | ``` 21 | The args used in the command above are: 22 | 23 | * `--use_peft` boolean flag to enable PEFT methods in the script 24 | * `--peft_method` to specify the PEFT method; here we use `lora`, other options are `llama_adapter` and `prefix`. 25 | * `--quantization` boolean flag to enable int8 quantization 26 | 27 | > [!NOTE] 28 | > In case you are using a multi-GPU machine, please make sure only one of the GPUs is visible by using `export CUDA_VISIBLE_DEVICES=GPU:id`. 29 | 30 | 31 | ### How to run with different datasets? 32 | 33 | Currently 3 open source datasets are supported; they can be found in the [Datasets config file](../../src/llama_recipes/configs/datasets.py). You can also use your custom dataset (more info [here](./datasets/README.md)). 34 | 35 | * `grammar_dataset` : use this [notebook](../../src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) to pull and process the Jfleg and C4 200M datasets for grammar checking. 36 | 37 | * `alpaca_dataset` : to get this open source data, please download `alpaca_data.json` into the `datasets` folder: 38 | 39 | 40 | ```bash 41 | wget -P ../../src/llama_recipes/datasets https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json 42 | ``` 43 | 44 | * `samsum_dataset` 45 | 46 | To run with each of these datasets, set the `dataset` flag in the command as shown below: 47 | 48 | ```bash 49 | # grammar_dataset 50 | 51 | python finetuning.py --use_peft --peft_method lora --quantization --dataset grammar_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model 52 | 53 | # alpaca_dataset 54 | 55 | python finetuning.py --use_peft --peft_method lora --quantization --dataset alpaca_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model 56 | 57 | 58 | # samsum_dataset 59 | 60 | python finetuning.py --use_peft --peft_method lora --quantization --dataset samsum_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model 61 | 62 | ``` 63 | 64 | ## FLOPS Counting and Pytorch Profiling 65 | 66 | To help with benchmarking efforts, we provide support for counting FLOPS during the fine-tuning process. You can achieve this by setting `--flop_counter` when launching your single/multi GPU fine-tuning. Use `--flop_counter_start` to choose at which step to start counting FLOPS. It is recommended to allow a warm-up stage before using the FLOPS counter. 67 | 68 | Similarly, you can set the `--use_profiler` flag and pass a profiling output path using `--profiler_dir` to capture the profile traces of your model using the [PyTorch profiler](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html). To get accurate profiling results, the PyTorch profiler requires a warm-up stage; the current config is wait=1, warmup=2, active=3, so the profiler will start profiling after step 3 and record the next 3 steps. Therefore, in order to use the PyTorch profiler, `--max_train_step` needs to be greater than 6. The PyTorch profiler can be helpful for debugging purposes. However, `--flop_counter` and `--use_profiler` cannot be used at the same time, to ensure measurement accuracy. 
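For reference, the wait=1, warmup=2, active=3 schedule described above corresponds to the following minimal `torch.profiler` sketch. This is only an illustration of why more than 6 training steps are needed; the trace directory name and the dummy workload are placeholders, not values used by the recipe.

```python
# Minimal sketch of the wait=1, warmup=2, active=3 schedule described above.
# "profiler_output" and the dummy matmul are illustrative placeholders only.
import torch
from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler

with profile(
    activities=[ProfilerActivity.CPU],
    schedule=schedule(wait=1, warmup=2, active=3),
    on_trace_ready=tensorboard_trace_handler("profiler_output"),
) as prof:
    for step in range(7):  # more than 6 steps so the 3 "active" steps are fully recorded
        _ = torch.randn(64, 64) @ torch.randn(64, 64)  # stand-in for a real training step
        prof.step()  # advance the profiler schedule once per training step
```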
69 | -------------------------------------------------------------------------------- /recipes/inference/llama_web_ui/Llama2_Gradio.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "e4532411", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# TODO REFACTOR: Integrate code from _legacy/inference.py into this notebook" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "47a9adb3", 16 | "metadata": {}, 17 | "source": [ 18 | "## This demo app shows how to query Llama 2 using the Gradio UI.\n", 19 | "\n", 20 | "Since we are using Replicate in this example, you will need to replace `` with your API token.\n", 21 | "\n", 22 | "To get the Replicate token: \n", 23 | "\n", 24 | "- You will need to first sign in with Replicate with your github account\n", 25 | "- Then create a free API token [here](https://replicate.com/account/api-tokens) that you can use for a while \n", 26 | "\n", 27 | "**Note** After the free trial ends, you will need to enter billing info to continue to use Llama2 hosted on Replicate.\n", 28 | "\n", 29 | "To run this example:\n", 30 | "- Set up your Replicate API token and enter it in place of ``\n", 31 | "- Run the notebook\n", 32 | "- Enter your question and click Submit\n", 33 | "\n", 34 | "In the notebook or a browser with URL http://127.0.0.1:7860 you should see a UI with your answer." 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 1, 40 | "id": "928041cc", 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stderr", 45 | "output_type": "stream", 46 | "text": [ 47 | "Init param `input` is deprecated, please use `model_kwargs` instead.\n" 48 | ] 49 | }, 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "Running on local URL: http://127.0.0.1:7860\n", 55 | "\n", 56 | "To create a public link, set `share=True` in `launch()`.\n" 57 | ] 58 | }, 59 | { 60 | "data": { 61 | "text/html": [ 62 | "
" 63 | ], 64 | "text/plain": [ 65 | "" 66 | ] 67 | }, 68 | "metadata": {}, 69 | "output_type": "display_data" 70 | }, 71 | { 72 | "data": { 73 | "text/plain": [] 74 | }, 75 | "execution_count": 1, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "from langchain.schema import AIMessage, HumanMessage\n", 82 | "import gradio as gr\n", 83 | "from langchain.llms import Replicate\n", 84 | "import os\n", 85 | "\n", 86 | "os.environ[\"REPLICATE_API_TOKEN\"] = \"\"\n", 87 | "\n", 88 | "llama2_13b_chat = \"meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d\"\n", 89 | "\n", 90 | "llm = Replicate(\n", 91 | " model=llama2_13b_chat,\n", 92 | " model_kwargs={\"temperature\": 0.01, \"top_p\": 1, \"max_new_tokens\":500}\n", 93 | ")\n", 94 | "\n", 95 | "\n", 96 | "def predict(message, history):\n", 97 | " history_langchain_format = []\n", 98 | " for human, ai in history:\n", 99 | " history_langchain_format.append(HumanMessage(content=human))\n", 100 | " history_langchain_format.append(AIMessage(content=ai))\n", 101 | " history_langchain_format.append(HumanMessage(content=message))\n", 102 | " gpt_response = llm(message) #history_langchain_format)\n", 103 | " return gpt_response#.content\n", 104 | "\n", 105 | "gr.ChatInterface(predict).launch()" 106 | ] 107 | } 108 | ], 109 | "metadata": { 110 | "kernelspec": { 111 | "display_name": "Python 3 (ipykernel)", 112 | "language": "python", 113 | "name": "python3" 114 | }, 115 | "language_info": { 116 | "codemirror_mode": { 117 | "name": "ipython", 118 | "version": 3 119 | }, 120 | "file_extension": ".py", 121 | "mimetype": "text/x-python", 122 | "name": "python", 123 | "nbconvert_exporter": "python", 124 | "pygments_lexer": "ipython3", 125 | "version": "3.8.18" 126 | } 127 | }, 128 | "nbformat": 4, 129 | "nbformat_minor": 5 130 | } 131 | -------------------------------------------------------------------------------- /recipes/inference/llama_web_ui/README.md: -------------------------------------------------------------------------------- 1 | ## Quick Web UI for Llama2 Chat 2 | If you prefer to see Llama2 in action in a web UI, instead of the notebooks above, you can try one of the two methods: 3 | 4 | ### Running [Streamlit](https://streamlit.io/) with Llama2 5 | Open a Terminal, run the following commands: 6 | ``` 7 | pip install streamlit langchain replicate 8 | git clone https://github.com/facebookresearch/llama-recipes 9 | cd llama-recipes/llama-demo-apps 10 | ``` 11 | 12 | Replace the `` in `streamlit_llama2.py` with your API token created [here](https://replicate.com/account/api-tokens) - for more info, see the note [above](#replicate_note). 13 | 14 | Then run the command `streamlit run streamlit_llama2.py` and you'll see on your browser the following UI with question and answer - you can enter new text question, click Submit, and see Llama2's answer: 15 | 16 | ![](../../../docs/images/llama2-streamlit.png) 17 | ![](../../../docs/images/llama2-streamlit2.png) 18 | 19 | ### Running [Gradio](https://www.gradio.app/) with Llama2 (using [Replicate](Llama2_Gradio.ipynb) or [OctoAI](../../llama_api_providers/OctoAI_API_examples/Llama2_Gradio.ipynb)) 20 | 21 | To see how to query Llama2 and get answers with the Gradio UI both from the notebook and web, just launch the notebook `Llama2_Gradio.ipynb`. For more info, on how to get set up with a token to power these apps, see the note on [Replicate](../../README.md#replicate_note) and [OctoAI](../../README.md##octoai_note). 
22 | 23 | Then enter your question, click Submit. You'll see in the notebook or a browser with URL http://127.0.0.1:7860 the following UI: 24 | 25 | ![](../../../docs/images/llama2-gradio.png) 26 | -------------------------------------------------------------------------------- /recipes/inference/llama_web_ui/requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit 2 | langchain 3 | replicate -------------------------------------------------------------------------------- /recipes/inference/llama_web_ui/streamlit_llama2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | # TODO REFACTOR: Convert this to an ipynb notebook 5 | 6 | import streamlit as st 7 | from langchain.llms import Replicate 8 | import os 9 | 10 | st.title("Llama2-powered Streamlit App") 11 | 12 | with st.sidebar: 13 | os.environ["REPLICATE_API_TOKEN"] = "" 14 | 15 | def generate_response(input_text): 16 | llama2_13b_chat = "meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d" 17 | 18 | llm = Replicate( 19 | model=llama2_13b_chat, 20 | model_kwargs={"temperature": 0.01, "top_p": 1, "max_new_tokens":500} 21 | ) 22 | st.info(llm(input_text)) 23 | 24 | with st.form("my_form"): 25 | text = st.text_area("Enter text:", "What is Generative AI?") 26 | submitted = st.form_submit_button("Submit") 27 | generate_response(text) 28 | -------------------------------------------------------------------------------- /recipes/inference/local_inference/README.md: -------------------------------------------------------------------------------- 1 | # Local Inference 2 | 3 | For local inference we have provided an [inference script](inference.py). Depending on the type of finetuning performed during training the [inference script](inference.py) takes different arguments. 4 | To finetune all model parameters the output dir of the training has to be given as --model_name argument. 5 | In the case of a parameter efficient method like lora the base model has to be given as --model_name and the output dir of the training has to be given as --peft_model argument. 6 | Additionally, a prompt for the model in the form of a text file has to be provided. The prompt file can either be piped through standard input or given as --prompt_file parameter. 7 | 8 | **Content Safety** 9 | The inference script also supports safety checks for both user prompt and model outputs. In particular, we use two packages, [AuditNLG](https://github.com/salesforce/AuditNLG/tree/main) and [Azure content safety](https://pypi.org/project/azure-ai-contentsafety/1.0.0b1/). 10 | 11 | **Note** 12 | If using Azure content Safety, please make sure to get the endpoint and API key as described [here](https://pypi.org/project/azure-ai-contentsafety/1.0.0b1/) and add them as the following environment variables,`CONTENT_SAFETY_ENDPOINT` and `CONTENT_SAFETY_KEY`. 13 | 14 | Examples: 15 | 16 | ```bash 17 | # Full finetuning of all parameters 18 | cat | python inference.py --model_name --use_auditnlg 19 | # PEFT method 20 | cat | python inference.py --model_name --peft_model --use_auditnlg 21 | # prompt as parameter 22 | python inference.py --model_name --prompt_file --use_auditnlg 23 | ``` 24 | The folder contains test prompts for summarization use-case: 25 | ``` 26 | samsum_prompt.txt 27 | ... 
28 | ``` 29 | 30 | **Note** 31 | Currently, the pad token by default in the [HuggingFace Tokenizer is `None`](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/tokenization_llama.py#L110). We add the padding token as a special token to the tokenizer, which in this case requires resizing the token_embeddings as shown below: 32 | 33 | ```python 34 | tokenizer.add_special_tokens( 35 | { 36 | 37 | "pad_token": "", 38 | } 39 | ) 40 | model.resize_token_embeddings(model.config.vocab_size + 1) 41 | ``` 42 | Padding would be required for batch inference. In this [example](inference.py), batch size = 1, so padding is essentially not required. However, we added the code pointer as an example in case of batch inference. 43 | 44 | 45 | ## Chat completion 46 | The inference folder also includes a chat completion example that adds built-in safety features in fine-tuned models to the prompt tokens. To run the example: 47 | 48 | ```bash 49 | python chat_completion/chat_completion.py --model_name "PATH/TO/MODEL/7B/" --prompt_file chat_completion/chats.json --quantization --use_auditnlg 50 | 51 | ``` 52 | 53 | ## Flash Attention and Xformer Memory Efficient Kernels 54 | 55 | Setting `use_fast_kernels` will enable the use of Flash Attention or Xformer memory-efficient kernels based on the hardware being used. This speeds up inference when used for batched inputs. This has been enabled in the `optimum` library from HuggingFace as a one-liner API; please read more [here](https://pytorch.org/blog/out-of-the-box-acceleration/). 56 | 57 | ```bash 58 | python chat_completion/chat_completion.py --model_name "PATH/TO/MODEL/7B/" --prompt_file chat_completion/chats.json --quantization --use_auditnlg --use_fast_kernels 59 | 60 | python inference.py --model_name  --peft_model  --prompt_file  --use_auditnlg --use_fast_kernels 61 | 62 | ``` 63 | 64 | ## Loading back FSDP checkpoints 65 | 66 | In case you have fine-tuned your model with pure FSDP and saved the checkpoints with "SHARDED_STATE_DICT" as shown [here](../../../src/llama_recipes/configs/fsdp.py), you can use this converter script to convert the FSDP sharded checkpoints into HuggingFace checkpoints. This enables you to use the inference script normally as mentioned above. 67 | **To convert the checkpoint, use the following commands.** 68 | 69 | This is helpful if you have fine-tuned your model using FSDP only, as follows: 70 | 71 | ```bash 72 | torchrun --nnodes 1 --nproc_per_node 8 recipes/finetuning/finetuning.py --enable_fsdp --model_name /path_of_model_folder/7B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 73 | ``` 74 | Then convert your FSDP checkpoint to HuggingFace checkpoints using: 75 | ```bash 76 | python -m llama_recipes.inference.checkpoint_converter_fsdp_hf --fsdp_checkpoint_path PATH/to/FSDP/Checkpoints --consolidated_model_path PATH/to/save/checkpoints --HF_model_path_or_name PATH/or/HF/model_name 77 | 78 | # --HF_model_path_or_name specifies the HF Llama model name or path where it has config.json and tokenizer.json 79 | ``` 80 | By default, training parameters are saved in `train_params.yaml` in the path where the FSDP checkpoints are saved. In the converter script, we first try to find the HuggingFace model name used during fine-tuning and load the model with its configs from there; if it is not found, the user needs to provide it. 
81 | 82 | Then run inference using: 83 | 84 | ```bash 85 | python inference.py --model_name --prompt_file 86 | 87 | ``` -------------------------------------------------------------------------------- /recipes/inference/local_inference/chat_completion/chats.json: -------------------------------------------------------------------------------- 1 | [ 2 | [{"role": "user", "content": "what is the recipe of mayonnaise?"}], 3 | [ 4 | {"role": "user", "content": "I am going to Paris, what should I see?"}, 5 | { 6 | "role": "assistant", 7 | "content": "Paris, the capital of France, is known for its stunning architecture, art museums, historical landmarks, and romantic atmosphere. Here are some of the top attractions to see in Paris:1. The Eiffel Tower: The iconic Eiffel Tower is one of the most recognizable landmarks in the world and offers breathtaking views of the city. 2. The Louvre Museum: The Louvre is one of the world's largest and most famous museums, housing an impressive collection of art and artifacts, including the Mona Lisa. 3. Notre-Dame Cathedral: This beautiful cathedral is one of the most famous landmarks in Paris and is known for its Gothic architecture and stunning stained glass windows.These are just a few of the many attractions that Paris has to offer. With so much to see and do, it's no wonder that Paris is one of the most popular tourist destinations in the world." 8 | }, 9 | {"role": "user", "content": "What is so great about #1?"} 10 | ], 11 | [ 12 | {"role": "system", "content": "Always answer with Haiku"}, 13 | {"role": "user", "content": "I am going to Paris, what should I see?"} 14 | ], 15 | [ 16 | { 17 | "role": "system", 18 | "content": "Always answer with emojis" 19 | }, 20 | {"role": "user", "content": "How to go from Beijing to NY?"} 21 | ], 22 | [ 23 | { 24 | "role": "system", 25 | "content": "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information." 26 | }, 27 | {"role": "user", "content": "Write a brief birthday message to John"} 28 | ] 29 | ] -------------------------------------------------------------------------------- /recipes/inference/local_inference/samsum_prompt.txt: -------------------------------------------------------------------------------- 1 | Summarize this dialog: 2 | A: Hi Tom, are you busy tomorrow’s afternoon? 3 | B: I’m pretty sure I am. What’s up? 4 | A: Can you go with me to the animal shelter?. 5 | B: What do you want to do? 6 | A: I want to get a puppy for my son. 7 | B: That will make him so happy. 8 | A: Yeah, we’ve discussed it many times. I think he’s ready now. 9 | B: That’s good. Raising a dog is a tough issue. Like having a baby ;-) 10 | A: I'll get him one of those little dogs. 11 | B: One that won't grow up too big;-) 12 | A: And eat too much;-)) 13 | B: Do you know which one he would like? 14 | A: Oh, yes, I took him there last Monday. He showed me one that he really liked. 15 | B: I bet you had to drag him away. 16 | A: He wanted to take it home right away ;-). 17 | B: I wonder what he'll name it. 
18 | A: He said he’d name it after his dead hamster – Lemmy - he's a great Motorhead fan :-))) 19 | --- 20 | Summary: -------------------------------------------------------------------------------- /recipes/inference/model_servers/README.md: -------------------------------------------------------------------------------- 1 | ## [Running Llama2 On-Prem with vLLM and TGI](llama-on-prem.md) 2 | This tutorial shows how to use Llama 2 with [vLLM](https://github.com/vllm-project/vllm) and Hugging Face [TGI](https://github.com/huggingface/text-generation-inference) to build Llama 2 on-prem apps. 3 | 4 | \* To run a quantized Llama2 model on iOS and Android, you can use the open source [MLC LLM](https://github.com/mlc-ai/mlc-llm) or [llama.cpp](https://github.com/ggerganov/llama.cpp). You can even make a Linux OS that boots to Llama2 ([repo](https://github.com/trholding/llama2.c)). -------------------------------------------------------------------------------- /recipes/inference/model_servers/hf_text_generation_inference/README.md: -------------------------------------------------------------------------------- 1 | # Serving a fine tuned Llama model with HuggingFace text-generation-inference server 2 | 3 | This document shows how to serve a fine tuned Llama mode with HuggingFace's text-generation-inference server. This option is currently only available for models that were trained using the LoRA method or without using the `--use_peft` argument. 4 | 5 | ## Step 0: Merging the weights (Only required if LoRA method was used) 6 | 7 | In case the model was fine tuned with LoRA method we need to merge the weights of the base model with the adapter weight. For this we can use the script `merge_lora_weights.py` which is located in the same folder as this README file. 8 | 9 | The script takes the base model, the peft weight folder as well as an output as arguments: 10 | 11 | ``` 12 | python -m llama_recipes.inference.hf_text_generation_inference.merge_lora_weights --base_model llama-7B --peft_model ft_output --output_dir data/merged_model_output 13 | ``` 14 | 15 | ## Step 1: Serving the model 16 | Subsequently, the model can be served using the docker container provided by [hf text-generation-inference](https://github.com/huggingface/text-generation-inference) started from the main directory of this repository: 17 | 18 | ```bash 19 | model=/data/merged_model_output 20 | num_shard=2 21 | volume=$PWD/inference/hf-text-generation-inference/data 22 | docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --num-shard $num_shard 23 | ``` 24 | 25 | The num_shard argument determines the number of GPU's the model should be sharded on. 26 | 27 | ## Step 2: Running inference 28 | After the loading of the model shards completed an inference can be executed by using one of the following commands: 29 | 30 | ```bash 31 | curl 127.0.0.1:8080/generate \ 32 | -X POST \ 33 | -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17}}' \ 34 | -H 'Content-Type: application/json' 35 | # OR for streaming inference 36 | curl 127.0.0.1:8080/generate_stream \ 37 | -X POST \ 38 | -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17}}' \ 39 | -H 'Content-Type: application/json' 40 | ``` 41 | 42 | Further information can be found in the documentation of the [hf text-generation-inference](https://github.com/huggingface/text-generation-inference) solution. 
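If you prefer Python over curl, the same request can be sent with the `requests` library. This is a minimal sketch assuming the server from Step 1 is reachable on port 8080 as configured above; the `generated_text` field is the response key returned by the text-generation-inference `/generate` endpoint.

```python
# Minimal sketch of the /generate request shown above, assuming the TGI server
# from Step 1 is listening on 127.0.0.1:8080.
import requests

payload = {
    "inputs": "What is Deep Learning?",
    "parameters": {"max_new_tokens": 17},
}

response = requests.post(
    "http://127.0.0.1:8080/generate",
    json=payload,
    headers={"Content-Type": "application/json"},
    timeout=60,
)
response.raise_for_status()
print(response.json()["generated_text"])
```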
43 | 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /recipes/inference/model_servers/hf_text_generation_inference/merge_lora_weights.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | import fire 5 | import torch 6 | from peft import PeftModel 7 | from transformers import LlamaForCausalLM, LlamaTokenizer 8 | 9 | 10 | def main(base_model: str, 11 | peft_model: str, 12 | output_dir: str): 13 | 14 | model = LlamaForCausalLM.from_pretrained( 15 | base_model, 16 | load_in_8bit=False, 17 | torch_dtype=torch.float16, 18 | device_map="auto", 19 | offload_folder="tmp", 20 | ) 21 | 22 | tokenizer = LlamaTokenizer.from_pretrained( 23 | base_model 24 | ) 25 | 26 | model = PeftModel.from_pretrained( 27 | model, 28 | peft_model, 29 | torch_dtype=torch.float16, 30 | device_map="auto", 31 | offload_folder="tmp", 32 | ) 33 | 34 | model = model.merge_and_unload() 35 | model.save_pretrained(output_dir) 36 | tokenizer.save_pretrained(output_dir) 37 | 38 | 39 | if __name__ == "__main__": 40 | fire.Fire(main) -------------------------------------------------------------------------------- /recipes/inference/model_servers/vllm/inference.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | import fire 5 | 6 | import torch 7 | from vllm import LLM 8 | from vllm import LLM, SamplingParams 9 | from accelerate.utils import is_xpu_available 10 | 11 | if is_xpu_available(): 12 | torch.xpu.manual_seed(42) 13 | else: 14 | torch.cuda.manual_seed(42) 15 | 16 | torch.manual_seed(42) 17 | 18 | def load_model(model_name, tp_size=1): 19 | 20 | llm = LLM(model_name, tensor_parallel_size=tp_size) 21 | return llm 22 | 23 | def main( 24 | model, 25 | max_new_tokens=100, 26 | user_prompt=None, 27 | top_p=0.9, 28 | temperature=0.8 29 | ): 30 | while True: 31 | if user_prompt is None: 32 | user_prompt = input("Enter your prompt: ") 33 | 34 | print(f"User prompt:\n{user_prompt}") 35 | 36 | print(f"sampling params: top_p {top_p} and temperature {temperature} for this inference request") 37 | sampling_param = SamplingParams(top_p=top_p, temperature=temperature, max_tokens=max_new_tokens) 38 | 39 | 40 | outputs = model.generate(user_prompt, sampling_params=sampling_param) 41 | 42 | print(f"model output:\n {user_prompt} {outputs[0].outputs[0].text}") 43 | user_prompt = input("Enter next prompt (press Enter to exit): ") 44 | if not user_prompt: 45 | break 46 | 47 | def run_script( 48 | model_name: str, 49 | peft_model=None, 50 | tp_size=1, 51 | max_new_tokens=100, 52 | user_prompt=None, 53 | top_p=0.9, 54 | temperature=0.8 55 | ): 56 | model = load_model(model_name, tp_size) 57 | main(model, max_new_tokens, user_prompt, top_p, temperature) 58 | 59 | if __name__ == "__main__": 60 | fire.Fire(run_script) 61 | -------------------------------------------------------------------------------- /recipes/llama_api_providers/OctoAI_API_examples/Llama2_Gradio.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "47a9adb3", 6 | "metadata": {}, 7 | "source": [ 8 | "## This 
demo app shows how to query Llama 2 using the Gradio UI.\n", 9 | "\n", 10 | "Since we are using OctoAI in this example, you'll need to obtain an OctoAI token:\n", 11 | "\n", 12 | "- You will need to first sign into [OctoAI](https://octoai.cloud/) with your Github or Google account\n", 13 | "- Then create a free API token [here](https://octo.ai/docs/getting-started/how-to-create-an-octoai-access-token) that you can use for a while (a month or $10 in OctoAI credits, whichever one runs out first)\n", 14 | "\n", 15 | "**Note** After the free trial ends, you will need to enter billing info to continue to use Llama2 hosted on OctoAI.\n", 16 | "\n", 17 | "To run this example:\n", 18 | "- Run the notebook\n", 19 | "- Set up your OCTOAI API token and enter it when prompted\n", 20 | "- Enter your question and click Submit\n", 21 | "\n", 22 | "In the notebook or a browser with URL http://127.0.0.1:7860 you should see a UI with your answer.\n", 23 | "\n", 24 | "Let's start by installing the necessary packages:\n", 25 | "- langchain provides necessary RAG tools for this demo\n", 26 | "- octoai-sdk allows us to use OctoAI Llama 2 endpoint\n", 27 | "- gradio is used for the UI elements\n", 28 | "\n", 29 | "And setting up the OctoAI token." 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "id": "6ae4f858-6ef7-49d9-b45b-1ef79d0217a0", 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "!pip install langchain octoai-sdk gradio" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "id": "3306c11d-ed82-41c5-a381-15fb5c07d307", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "from getpass import getpass\n", 50 | "import os\n", 51 | "\n", 52 | "OCTOAI_API_TOKEN = getpass()\n", 53 | "os.environ[\"OCTOAI_API_TOKEN\"] = OCTOAI_API_TOKEN" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "928041cc", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "from langchain.schema import AIMessage, HumanMessage\n", 64 | "import gradio as gr\n", 65 | "from langchain.llms.octoai_endpoint import OctoAIEndpoint\n", 66 | "\n", 67 | "llama2_13b = \"llama-2-13b-chat-fp16\"\n", 68 | "\n", 69 | "llm = OctoAIEndpoint(\n", 70 | " endpoint_url=\"https://text.octoai.run/v1/chat/completions\",\n", 71 | " model_kwargs={\n", 72 | " \"model\": llama2_13b,\n", 73 | " \"messages\": [\n", 74 | " {\n", 75 | " \"role\": \"system\",\n", 76 | " \"content\": \"You are a helpful, respectful and honest assistant.\"\n", 77 | " }\n", 78 | " ],\n", 79 | " \"max_tokens\": 500,\n", 80 | " \"top_p\": 1,\n", 81 | " \"temperature\": 0.01\n", 82 | " },\n", 83 | ")\n", 84 | "\n", 85 | "\n", 86 | "def predict(message, history):\n", 87 | " history_langchain_format = []\n", 88 | " for human, ai in history:\n", 89 | " history_langchain_format.append(HumanMessage(content=human))\n", 90 | " history_langchain_format.append(AIMessage(content=ai))\n", 91 | " history_langchain_format.append(HumanMessage(content=message))\n", 92 | " llm_response = llm(message, history_langchain_format)\n", 93 | " return llm_response.content\n", 94 | "\n", 95 | "gr.ChatInterface(predict).launch()" 96 | ] 97 | } 98 | ], 99 | "metadata": { 100 | "kernelspec": { 101 | "display_name": "Python 3 (ipykernel)", 102 | "language": "python", 103 | "name": "python3" 104 | }, 105 | "language_info": { 106 | "codemirror_mode": { 107 | "name": "ipython", 108 | "version": 3 109 | }, 110 | "file_extension": ".py", 111 | "mimetype": "text/x-python", 112 | "name": 
"python", 113 | "nbconvert_exporter": "python", 114 | "pygments_lexer": "ipython3", 115 | "version": "3.11.6" 116 | } 117 | }, 118 | "nbformat": 4, 119 | "nbformat_minor": 5 120 | } 121 | -------------------------------------------------------------------------------- /recipes/llama_api_providers/OctoAI_API_examples/RAG_Chatbot_example/data/Llama Getting Started Guide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plstory/DS2D/d3f734bd465d6239100b50bb11b3903741a2be08/recipes/llama_api_providers/OctoAI_API_examples/RAG_Chatbot_example/data/Llama Getting Started Guide.pdf -------------------------------------------------------------------------------- /recipes/llama_api_providers/OctoAI_API_examples/RAG_Chatbot_example/requirements.txt: -------------------------------------------------------------------------------- 1 | gradio==4.16.0 2 | pypdf==4.0.0 3 | langchain==0.1.7 4 | sentence-transformers==2.2.2 5 | faiss-cpu==1.7.4 6 | text-generation==0.6.1 7 | octoai-sdk==0.8.3 -------------------------------------------------------------------------------- /recipes/llama_api_providers/OctoAI_API_examples/RAG_Chatbot_example/vectorstore/db_faiss/index.faiss: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plstory/DS2D/d3f734bd465d6239100b50bb11b3903741a2be08/recipes/llama_api_providers/OctoAI_API_examples/RAG_Chatbot_example/vectorstore/db_faiss/index.faiss -------------------------------------------------------------------------------- /recipes/llama_api_providers/OctoAI_API_examples/RAG_Chatbot_example/vectorstore/db_faiss/index.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plstory/DS2D/d3f734bd465d6239100b50bb11b3903741a2be08/recipes/llama_api_providers/OctoAI_API_examples/RAG_Chatbot_example/vectorstore/db_faiss/index.pkl -------------------------------------------------------------------------------- /recipes/multilingual/extend_tokenizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code borrowed from https://github.com/ymcui/Chinese-LLaMA-Alpaca/blob/main/scripts/merge_tokenizer/merge_tokenizers.py 3 | """ 4 | 5 | import os 6 | import fire 7 | import re 8 | from transformers import LlamaTokenizer 9 | 10 | os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" 11 | from huggingface_hub import hf_hub_download 12 | from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model 13 | 14 | 15 | def main(new_tokenizer_path, extended_tokenizer_save_path): 16 | original_tokenizer_path = hf_hub_download(repo_id="meta-llama/Llama-2-7b-chat-hf", filename="tokenizer.model", local_dir="original_tokenizer") 17 | original_tokenizer_spm = sp_pb2_model.ModelProto() 18 | original_tokenizer_spm.ParseFromString(open(original_tokenizer_path, "rb").read()) 19 | new_tokenizer_spm = sp_pb2_model.ModelProto() 20 | new_tokenizer_spm.ParseFromString(open(os.path.join(new_tokenizer_path, "tokenizer.model"), "rb").read()) 21 | 22 | def contains_eng(text): 23 | eng_pattern = re.compile(r"[\u0020-\u007E]+") 24 | return True if eng_pattern.search(text) else False 25 | 26 | original_tokenizer_tokenset = set(p.piece for p in original_tokenizer_spm.pieces) 27 | print(f"Number of tokens before merge: {len(original_tokenizer_tokenset)}") 28 | for p in new_tokenizer_spm.pieces: 29 | piece = p.piece 30 | if piece not in original_tokenizer_tokenset and not 
contains_eng(piece): 31 | new_p = sp_pb2_model.ModelProto().SentencePiece() 32 | new_p.piece = piece 33 | new_p.score = 0 34 | original_tokenizer_spm.pieces.append(new_p) 35 | print(f"Number of tokens after merge: {len(original_tokenizer_spm.pieces)}") 36 | 37 | os.makedirs(extended_tokenizer_save_path, exist_ok=True) 38 | with open(os.path.join(extended_tokenizer_save_path, "tokenizer.model"), "wb") as f: 39 | f.write(original_tokenizer_spm.SerializeToString()) 40 | tokenizer = LlamaTokenizer(vocab_file=os.path.join(extended_tokenizer_save_path, "tokenizer.model"), legacy=False) 41 | tokenizer.save_pretrained(extended_tokenizer_save_path) 42 | print(f"Tokenizer saved to {extended_tokenizer_save_path}") 43 | 44 | # Verify that the extended tokenizer's English vocab matches with that of the original Llama tokenizer 45 | tok1 = LlamaTokenizer.from_pretrained('meta-llama/Llama-2-7b-chat-hf') 46 | tok2 = LlamaTokenizer.from_pretrained(extended_tokenizer_save_path) 47 | for i in range(len(tok1)): 48 | assert tok1.convert_ids_to_tokens(i) == tok2.convert_ids_to_tokens(i), f"Token mismatch at index {i}." 49 | 50 | 51 | if __name__ == "__main__": 52 | fire.Fire(main) -------------------------------------------------------------------------------- /recipes/multilingual/imgs/phase1-eval-loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plstory/DS2D/d3f734bd465d6239100b50bb11b3903741a2be08/recipes/multilingual/imgs/phase1-eval-loss.png -------------------------------------------------------------------------------- /recipes/multilingual/imgs/phase1-train-loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plstory/DS2D/d3f734bd465d6239100b50bb11b3903741a2be08/recipes/multilingual/imgs/phase1-train-loss.png -------------------------------------------------------------------------------- /recipes/multilingual/imgs/phase2-eval-loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plstory/DS2D/d3f734bd465d6239100b50bb11b3903741a2be08/recipes/multilingual/imgs/phase2-eval-loss.png -------------------------------------------------------------------------------- /recipes/multilingual/imgs/phase2-train-loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plstory/DS2D/d3f734bd465d6239100b50bb11b3903741a2be08/recipes/multilingual/imgs/phase2-train-loss.png -------------------------------------------------------------------------------- /recipes/multilingual/prepare_data.py: -------------------------------------------------------------------------------- 1 | import fire 2 | import os 3 | from datasets import load_dataset 4 | 5 | DATASET = "rahular/varta" 6 | 7 | def main(split="validation", lang="hi", docs_to_sample=10_000, save_path="data"): 8 | dataset = load_dataset(DATASET, split=split, streaming=True) 9 | os.makedirs(save_path, exist_ok=True) 10 | with open(os.path.join(save_path, f"{lang}.txt"), "w") as f: 11 | count = 0 12 | for idx, d in enumerate(dataset): 13 | if idx % 10_000 == 0: 14 | print(f"Searched {idx} documents for {lang} documents. 
Found {count} documents.") 15 | if count >= docs_to_sample: 16 | break 17 | if d["langCode"] == lang: 18 | f.write(d["headline"] + "\n" + d["text"] + "\n") 19 | count += 1 20 | 21 | 22 | if __name__ == "__main__": 23 | fire.Fire(main) -------------------------------------------------------------------------------- /recipes/multilingual/train_tokenizer.py: -------------------------------------------------------------------------------- 1 | import fire 2 | import os 3 | import sentencepiece as spm 4 | 5 | def main(data_file, save_path, vocab_size=16_000, num_threads=8): 6 | os.makedirs(save_path, exist_ok=True) 7 | tokenizer_name = os.path.join(save_path, "tokenizer") 8 | 9 | spm.SentencePieceTrainer.train( 10 | input=data_file, 11 | model_prefix=tokenizer_name, 12 | vocab_size=vocab_size, 13 | num_threads=num_threads, 14 | model_type="bpe", 15 | max_sentence_length=1073741824, 16 | shuffle_input_sentence="true", 17 | character_coverage=1.0, 18 | hard_vocab_limit="false", 19 | ) 20 | 21 | if __name__ == "__main__": 22 | fire.Fire(main) 23 | -------------------------------------------------------------------------------- /recipes/quickstart/Running_Llama3_Anywhere/Running_Llama_on_Mac_Windows_Linux.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Running Llama 3 on Mac, Windows or Linux\n", 8 | "This notebook goes over how you can set up and run Llama 3 locally on a Mac, Windows or Linux using [Ollama](https://ollama.com/)." 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "### Steps at a glance:\n", 16 | "1. Download and install Ollama.\n", 17 | "2. Download and test run Llama 3.\n", 18 | "3. Use local Llama 3 via Python.\n", 19 | "4. Use local Llama 3 via LangChain.\n" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "#### 1. Download and install Ollama\n", 27 | "\n", 28 | "On Mac or Windows, go to the Ollama download page [here](https://ollama.com/download) and select your platform to download it, then double click the downloaded file to install Ollama.\n", 29 | "\n", 30 | "On Linux, you can simply run on a terminal `curl -fsSL https://ollama.com/install.sh | sh` to download and install Ollama." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "#### 2. Download and test run Llama 3\n", 38 | "\n", 39 | "On a terminal or console, run `ollama pull llama3` to download the Llama 3 8b chat model, in the 4-bit quantized format with size about 4.7 GB.\n", 40 | "\n", 41 | "Run `ollama pull llama3:70b` to download the Llama 3 70b chat model, also in the 4-bit quantized format with size 39GB.\n", 42 | "\n", 43 | "Then you can run `ollama run llama3` and ask Llama 3 questions such as \"who wrote the book godfather?\" or \"who wrote the book godfather? 
answer in one sentence.\" You can also try `ollama run llama3:70b`, but the inference speed will most likely be too slow - for example, on an Apple M1 Pro with 32GB RAM, it takes over 10 seconds to generate one token using Llama 3 70b chat (vs over 10 tokens per second with Llama 3 8b chat).\n", 44 | "\n", 45 | "You can also run the following command to test Llama 3 8b chat:\n", 46 | "```\n", 47 | " curl http://localhost:11434/api/chat -d '{\n", 48 | " \"model\": \"llama3\",\n", 49 | " \"messages\": [\n", 50 | " {\n", 51 | " \"role\": \"user\",\n", 52 | " \"content\": \"who wrote the book godfather?\"\n", 53 | " }\n", 54 | " ],\n", 55 | " \"stream\": false\n", 56 | "}'\n", 57 | "```\n", 58 | "\n", 59 | "The complete Ollama API doc is [here](https://github.com/ollama/ollama/blob/main/docs/api.md)." 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "#### 3. Use local Llama 3 via Python\n", 67 | "\n", 68 | "The Python code below is the port of the curl command above." 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "import requests\n", 78 | "import json\n", 79 | "\n", 80 | "url = \"http://localhost:11434/api/chat\"\n", 81 | "\n", 82 | "def llama3(prompt):\n", 83 | " data = {\n", 84 | " \"model\": \"llama3\",\n", 85 | " \"messages\": [\n", 86 | " {\n", 87 | " \"role\": \"user\",\n", 88 | " \"content\": prompt\n", 89 | " }\n", 90 | " ],\n", 91 | " \"stream\": False\n", 92 | " }\n", 93 | " \n", 94 | " headers = {\n", 95 | " 'Content-Type': 'application/json'\n", 96 | " }\n", 97 | " \n", 98 | " response = requests.post(url, headers=headers, json=data)\n", 99 | " \n", 100 | " return(response.json()['message']['content'])" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "response = llama3(\"who wrote the book godfather\")\n", 110 | "print(response)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "#### 4. Use local Llama 3 via LangChain\n", 118 | "\n", 119 | "Code below use LangChain with Ollama to query Llama 3 running locally. For a more advanced example of using local Llama 3 with LangChain and agent-powered RAG, see [this](https://github.com/langchain-ai/langgraph/blob/main/examples/rag/langgraph_rag_agent_llama3_local.ipynb)." 
120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "!pip install langchain" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "from langchain_community.chat_models import ChatOllama\n", 138 | "\n", 139 | "llm = ChatOllama(model=\"llama3\", temperature=0)\n", 140 | "response = llm.invoke(\"who wrote the book godfather?\")\n", 141 | "print(response.content)\n" 142 | ] 143 | } 144 | ], 145 | "metadata": { 146 | "kernelspec": { 147 | "display_name": "Python 3 (ipykernel)", 148 | "language": "python", 149 | "name": "python3" 150 | }, 151 | "language_info": { 152 | "codemirror_mode": { 153 | "name": "ipython", 154 | "version": 3 155 | }, 156 | "file_extension": ".py", 157 | "mimetype": "text/x-python", 158 | "name": "python", 159 | "nbconvert_exporter": "python", 160 | "pygments_lexer": "ipython3", 161 | "version": "3.11.9" 162 | } 163 | }, 164 | "nbformat": 4, 165 | "nbformat_minor": 4 166 | } 167 | -------------------------------------------------------------------------------- /recipes/responsible_ai/README.md: -------------------------------------------------------------------------------- 1 | # Meta Llama Guard 2 | 3 | Meta Llama Guard and Meta Llama Guard 2 are new models that provide input and output guardrails for LLM inference. For more details, please visit the main [repository](https://github.com/facebookresearch/PurpleLlama/tree/main/Llama-Guard2). 4 | 5 | **Note** Please find the right model on HF side [here](https://huggingface.co/meta-llama/Meta-Llama-Guard-2-8B). 6 | 7 | ### Running locally 8 | The [llama_guard](llama_guard) folder contains the inference script to run Meta Llama Guard locally. Add test prompts directly to the [inference script](llama_guard/inference.py) before running it. 9 | 10 | ### Running on the cloud 11 | The notebooks [Purple_Llama_Anyscale](Purple_Llama_Anyscale.ipynb) & [Purple_Llama_OctoAI](Purple_Llama_OctoAI.ipynb) contain examples for running Meta Llama Guard on cloud hosted endpoints. -------------------------------------------------------------------------------- /recipes/responsible_ai/llama_guard/README.md: -------------------------------------------------------------------------------- 1 | # Meta Llama Guard demo 2 | 3 | Meta Llama Guard is a language model that provides input and output guardrails for LLM inference. For more details and model cards, please visit the main repository for each model, [Meta Llama Guard](https://github.com/meta-llama/PurpleLlama/tree/main/Llama-Guard) and Meta [Llama Guard 2](https://github.com/meta-llama/PurpleLlama/tree/main/Llama-Guard2). 4 | 5 | This folder contains an example file to run inference with a locally hosted model, either using the Hugging Face Hub or a local path. 6 | 7 | ## Requirements 8 | 1. Access to Llama guard model weights on Hugging Face. To get access, follow the steps described [here](https://github.com/facebookresearch/PurpleLlama/tree/main/Llama-Guard#download) 9 | 2. Llama recipes package and it's dependencies [installed](https://github.com/meta-llama/llama-recipes?tab=readme-ov-file#installing) 10 | 11 | 12 | ## Llama Guard inference script 13 | For testing, you can add User or User/Agent interactions into the prompts list and the run the script to verify the results. When the conversation has one or more Agent responses, it's considered of type agent. 
14 | 15 | 16 | ``` 17 | prompts: List[Tuple[List[str], AgentType]] = [ 18 | ([""], AgentType.USER), 19 | 20 | (["", 21 | ""], AgentType.AGENT), 22 | 23 | (["", 24 | "", 25 | "", 26 | "",], AgentType.AGENT), 27 | 28 | ] 29 | ``` 30 | The complete prompt is built with the `build_custom_prompt` function, defined in [prompt_format.py](../../../src/llama_recipes/inference/prompt_format_utils.py). The file contains the default Meta Llama Guard categories. These categories can adjusted and new ones can be added, as described in the [research paper](https://ai.meta.com/research/publications/llama-guard-llm-based-input-output-safeguard-for-human-ai-conversations/), on section 4.5 Studying the adaptability of the model. 31 | 32 | 33 | To run the samples, with all the dependencies installed, execute this command: 34 | 35 | `python recipes/responsible_ai/llama_guard/inference.py` 36 | 37 | This is the output: 38 | 39 | ``` 40 | [''] 41 | > safe 42 | 43 | ================================== 44 | 45 | ['', ''] 46 | > safe 47 | 48 | ================================== 49 | 50 | ['', '', '', ''] 51 | > safe 52 | 53 | ================================== 54 | ``` 55 | 56 | To run it with a local model, you can use the `model_id` param in the inference script: 57 | 58 | `python recipes/responsible_ai/llama_guard/inference.py --model_id=/home/ubuntu/models/llama3/llama_guard_2-hf/ --llama_guard_version=LLAMA_GUARD_2` 59 | 60 | Note: Make sure to also add the llama_guard_version if when it does not match the default, the script allows you to run the prompt format from Meta Llama Guard 1 on Meta Llama Guard 2 61 | 62 | ## Inference Safety Checker 63 | When running the regular inference script with prompts, Meta Llama Guard will be used as a safety checker on the user prompt and the model output. If both are safe, the result will be shown, else a message with the error will be shown, with the word unsafe and a comma separated list of categories infringed. Meta Llama Guard is always loaded quantized using Hugging Face Transformers library with bitsandbytes. 64 | 65 | In this case, the default categories are applied by the tokenizer, using the `apply_chat_template` method. 66 | 67 | Use this command for testing with a quantized Llama model, modifying the values accordingly: 68 | 69 | `python examples/inference.py --model_name --prompt_file --quantization --enable_llamaguard_content_safety` 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /recipes/responsible_ai/llama_guard/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | -------------------------------------------------------------------------------- /recipes/responsible_ai/llama_guard/inference.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 
3 | 4 | import fire 5 | from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig 6 | 7 | 8 | from llama_recipes.inference.prompt_format_utils import build_default_prompt, create_conversation, LlamaGuardVersion 9 | from typing import List, Tuple 10 | from enum import Enum 11 | 12 | class AgentType(Enum): 13 | AGENT = "Agent" 14 | USER = "User" 15 | 16 | def main( 17 | model_id: str = "meta-llama/LlamaGuard-7b", 18 | llama_guard_version: LlamaGuardVersion = LlamaGuardVersion.LLAMA_GUARD_1 19 | ): 20 | """ 21 | Entry point for Llama Guard inference sample script. 22 | 23 | This function loads Llama Guard from Hugging Face or a local model and 24 | executes the predefined prompts in the script to showcase how to do inference with Llama Guard. 25 | 26 | Args: 27 | model_id (str): The ID of the pretrained model to use for generation. This can be either the path to a local folder containing the model files, 28 | or the repository ID of a model hosted on the Hugging Face Hub. Defaults to 'meta-llama/LlamaGuard-7b'. 29 | llama_guard_version (LlamaGuardVersion): The version of the Llama Guard model to use for formatting prompts. Defaults to LLAMA_GUARD_1. 30 | """ 31 | try: 32 | llama_guard_version = LlamaGuardVersion[llama_guard_version] 33 | except KeyError as e: 34 | raise ValueError(f"Invalid Llama Guard version '{llama_guard_version}'. Valid values are: {', '.join([lgv.name for lgv in LlamaGuardVersion])}") from e 35 | 36 | prompts: List[Tuple[List[str], AgentType]] = [ 37 | ([""], AgentType.USER), 38 | 39 | (["", 40 | ""], AgentType.AGENT), 41 | 42 | (["", 43 | "", 44 | "", 45 | "",], AgentType.AGENT), 46 | 47 | ] 48 | 49 | quantization_config = BitsAndBytesConfig(load_in_8bit=True) 50 | 51 | tokenizer = AutoTokenizer.from_pretrained(model_id) 52 | model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config, device_map="auto") 53 | 54 | for prompt in prompts: 55 | formatted_prompt = build_default_prompt( 56 | prompt[1], 57 | create_conversation(prompt[0]), 58 | llama_guard_version) 59 | 60 | 61 | input = tokenizer([formatted_prompt], return_tensors="pt").to("cuda") 62 | prompt_len = input["input_ids"].shape[-1] 63 | output = model.generate(**input, max_new_tokens=100, pad_token_id=0) 64 | results = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True) 65 | 66 | 67 | print(prompt[0]) 68 | print(f"> {results}") 69 | print("\n==================================\n") 70 | 71 | if __name__ == "__main__": 72 | try: 73 | fire.Fire(main) 74 | except Exception as e: 75 | print(e) -------------------------------------------------------------------------------- /recipes/use_cases/README.md: -------------------------------------------------------------------------------- 1 | ## [VideoSummary](VideoSummary.ipynb): Ask Llama 3 to Summarize a Long YouTube Video (using Replicate or [OctoAI](../llama_api_providers/OctoAI_API_examples/VideoSummary.ipynb)) 2 | This demo app uses Llama 3 to return a text summary of a YouTube video. It shows how to retrieve the caption of a YouTube video and how to ask Llama to summarize the content in different ways, from the simplest naive way that works for short text to more advanced methods of using LangChain's map_reduce and refine to overcome the 8K context length limit of Llama 3. 3 | 4 | ## [NBA2023-24](./text2sql/StructuredLlama.ipynb): Ask Llama 3 about Structured Data 5 | This demo app shows how to use LangChain and Llama 3 to let users ask questions about **structured** data stored in a SQL DB. 
As the 2023-24 NBA season is entering the playoff, we use the NBA roster info saved in a SQLite DB to show you how to ask Llama 3 questions about your favorite teams or players. 6 | 7 | ## [LiveData](LiveData.ipynb): Ask Llama 3 about Live Data (using Replicate or [OctoAI](../llama_api_providers/OctoAI_API_examples/LiveData.ipynb)) 8 | This demo app shows how to perform live data augmented generation tasks with Llama 3, [LlamaIndex](https://github.com/run-llama/llama_index), another leading open-source framework for building LLM apps, and the [Tavily](https://tavily.com) live search API. 9 | 10 | ## [WhatsApp Chatbot](./chatbots/whatsapp_llama/whatsapp_llama3.md): Building a Llama 3 Enabled WhatsApp Chatbot 11 | This step-by-step tutorial shows how to use the [WhatsApp Business API](https://developers.facebook.com/docs/whatsapp/cloud-api/overview) to build a Llama 3 enabled WhatsApp chatbot. 12 | 13 | ## [Messenger Chatbot](./chatbots/messenger_llama/messenger_llama3.md): Building a Llama 3 Enabled Messenger Chatbot 14 | This step-by-step tutorial shows how to use the [Messenger Platform](https://developers.facebook.com/docs/messenger-platform/overview) to build a Llama 3 enabled Messenger chatbot. 15 | 16 | ### RAG Chatbot Example (running [locally](./chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb) or on [OctoAI](../llama_api_providers/OctoAI_API_examples/RAG_Chatbot_example/RAG_Chatbot_Example.ipynb)) 17 | A complete example of how to build a Llama 3 chatbot hosted on your browser that can answer questions based on your own data using retrieval augmented generation (RAG). You can run Llama2 locally if you have a good enough GPU or on OctoAI if you follow the note [here](../README.md#octoai_note). -------------------------------------------------------------------------------- /recipes/use_cases/chatbots/RAG_chatbot/data/Llama Getting Started Guide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plstory/DS2D/d3f734bd465d6239100b50bb11b3903741a2be08/recipes/use_cases/chatbots/RAG_chatbot/data/Llama Getting Started Guide.pdf -------------------------------------------------------------------------------- /recipes/use_cases/chatbots/RAG_chatbot/requirements.txt: -------------------------------------------------------------------------------- 1 | gradio 2 | pypdf 3 | langchain 4 | sentence-transformers 5 | faiss-cpu 6 | text-generation -------------------------------------------------------------------------------- /recipes/use_cases/chatbots/RAG_chatbot/vectorstore/db_faiss/index.faiss: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plstory/DS2D/d3f734bd465d6239100b50bb11b3903741a2be08/recipes/use_cases/chatbots/RAG_chatbot/vectorstore/db_faiss/index.faiss -------------------------------------------------------------------------------- /recipes/use_cases/chatbots/RAG_chatbot/vectorstore/db_faiss/index.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plstory/DS2D/d3f734bd465d6239100b50bb11b3903741a2be08/recipes/use_cases/chatbots/RAG_chatbot/vectorstore/db_faiss/index.pkl -------------------------------------------------------------------------------- /recipes/use_cases/chatbots/messenger_llama/llama_messenger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 
2 | # This software may be used and distributed according to the terms of the Llama 3 Community License Agreement. 3 | 4 | import langchain 5 | from langchain.llms import Replicate 6 | 7 | from flask import Flask 8 | from flask import request 9 | import os 10 | import requests 11 | import json 12 | 13 | os.environ["REPLICATE_API_TOKEN"] = "" 14 | llama3_8b_chat = "meta/meta-llama-3-8b-instruct" 15 | 16 | llm = Replicate( 17 | model=llama3_8b_chat, 18 | model_kwargs={"temperature": 0.0, "top_p": 1, "max_new_tokens":500} 19 | ) 20 | 21 | app = Flask(__name__) 22 | 23 | @app.route('/msgrcvd_pager', methods=['POST', 'GET']) 24 | def msgrcvd_pager(): 25 | message = request.args.get('message') 26 | sender = request.args.get('sender') 27 | recipient = request.args.get('recipient') 28 | 29 | answer = llm(message) 30 | print(message) 31 | print(answer) 32 | 33 | url = f"https://graph.facebook.com/v18.0/{recipient}/messages" 34 | params = { 35 | 'recipient': '{"id": ' + sender + '}', 36 | 'message': json.dumps({'text': answer}), 37 | 'messaging_type': 'RESPONSE', 38 | 'access_token': "" 39 | } 40 | headers = { 41 | 'Content-Type': 'application/json' 42 | } 43 | response = requests.post(url, params=params, headers=headers) 44 | print(response.status_code) 45 | print(response.text) 46 | 47 | return message + "

" + answer 48 | 49 | -------------------------------------------------------------------------------- /recipes/use_cases/chatbots/whatsapp_llama/llama_chatbot.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 3 Community License Agreement. 3 | 4 | import langchain 5 | from langchain.llms import Replicate 6 | 7 | from flask import Flask 8 | from flask import request 9 | import os 10 | import requests 11 | import json 12 | 13 | class WhatsAppClient: 14 | 15 | API_URL = "https://graph.facebook.com/v17.0/" 16 | WHATSAPP_API_TOKEN = "" 17 | WHATSAPP_CLOUD_NUMBER_ID = "" 18 | 19 | def __init__(self): 20 | self.headers = { 21 | "Authorization": f"Bearer {self.WHATSAPP_API_TOKEN}", 22 | "Content-Type": "application/json", 23 | } 24 | self.API_URL = self.API_URL + self.WHATSAPP_CLOUD_NUMBER_ID 25 | 26 | def send_text_message(self,message, phone_number): 27 | payload = { 28 | "messaging_product": 'whatsapp', 29 | "to": phone_number, 30 | "type": "text", 31 | "text": { 32 | "preview_url": False, 33 | "body": message 34 | } 35 | } 36 | response = requests.post(f"{self.API_URL}/messages", json=payload,headers=self.headers) 37 | print(response.status_code) 38 | assert response.status_code == 200, "Error sending message" 39 | return response.status_code 40 | 41 | os.environ["REPLICATE_API_TOKEN"] = "" 42 | llama3_8b_chat = "meta/meta-llama-3-8b-instruct" 43 | 44 | llm = Replicate( 45 | model=llama3_8b_chat, 46 | model_kwargs={"temperature": 0.0, "top_p": 1, "max_new_tokens":500} 47 | ) 48 | client = WhatsAppClient() 49 | app = Flask(__name__) 50 | 51 | @app.route("/") 52 | def hello_llama(): 53 | return "

Hello Llama 3

" 54 | 55 | @app.route('/msgrcvd', methods=['POST', 'GET']) 56 | def msgrcvd(): 57 | message = request.args.get('message') 58 | answer = llm(message) 59 | print(message) 60 | print(answer) 61 | client.send_text_message(llm(message), "") 62 | return message + "

" + answer 63 | 64 | -------------------------------------------------------------------------------- /recipes/use_cases/text2sql/csv2db.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | import sqlite3 5 | import csv 6 | 7 | # Define the input CSV file and the SQLite database file 8 | input_csv = 'nba_roster.csv' 9 | database_file = 'nba_roster.db' 10 | 11 | # Connect to the SQLite database 12 | conn = sqlite3.connect(database_file) 13 | cursor = conn.cursor() 14 | 15 | # Create a table to store the data 16 | cursor.execute('''CREATE TABLE IF NOT EXISTS nba_roster ( 17 | Team TEXT, 18 | NAME TEXT, 19 | Jersey TEXT, 20 | POS TEXT, 21 | AGE INT, 22 | HT TEXT, 23 | WT TEXT, 24 | COLLEGE TEXT, 25 | SALARY TEXT 26 | )''') 27 | 28 | # Read data from the CSV file and insert it into the SQLite table 29 | with open(input_csv, 'r', newline='') as csvfile: 30 | csv_reader = csv.reader(csvfile) 31 | next(csv_reader) # Skip the header row 32 | 33 | for row in csv_reader: 34 | cursor.execute('INSERT INTO nba_roster VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', row) 35 | 36 | # Commit the changes and close the database connection 37 | conn.commit() 38 | conn.close() 39 | 40 | print(f'Data from {input_csv} has been successfully imported into {database_file}') 41 | 42 | -------------------------------------------------------------------------------- /recipes/use_cases/text2sql/nba_roster.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plstory/DS2D/d3f734bd465d6239100b50bb11b3903741a2be08/recipes/use_cases/text2sql/nba_roster.db -------------------------------------------------------------------------------- /recipes/use_cases/text2sql/txt2csv.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 
3 | 4 | import csv 5 | 6 | # Define the input and output file names 7 | input_file = 'nba.txt' 8 | output_file = 'nba_roster.csv' 9 | 10 | # Initialize lists to store data 11 | roster_data = [] 12 | current_team = None 13 | 14 | # Open the input file 15 | with open(input_file, 'r') as file: 16 | for line in file: 17 | # Remove leading and trailing whitespaces from the line 18 | line = line.strip() 19 | 20 | # Check if the line starts with 'https', skip it 21 | if line.startswith('https'): 22 | continue 23 | 24 | # Check if the line contains the team name 25 | if 'Roster' in line: 26 | current_team = line.split(' Roster ')[0] 27 | elif line and "NAME" not in line: # Skip empty lines and header lines 28 | # Split the line using tabs as the delimiter 29 | player_info = line.split('\t') 30 | 31 | # Remove any numbers from the player's name and set Jersey accordingly 32 | name = ''.join([c for c in player_info[0] if not c.isdigit()]) 33 | jersey = ''.join([c for c in player_info[0] if c.isdigit()]) 34 | 35 | # If no number found, set Jersey to "NA" 36 | if not jersey: 37 | jersey = "NA" 38 | 39 | # Append the team name, name, and jersey to the player's data 40 | player_info = [current_team, name, jersey] + player_info[1:] 41 | 42 | # Append the player's data to the roster_data list 43 | roster_data.append(player_info) 44 | 45 | # Write the data to a CSV file 46 | with open(output_file, 'w', newline='') as csvfile: 47 | writer = csv.writer(csvfile) 48 | 49 | # Write the header row 50 | writer.writerow(['Team', 'NAME', 'Jersey', 'POS', 'AGE', 'HT', 'WT', 'COLLEGE', 'SALARY']) 51 | 52 | # Write the player data 53 | writer.writerows(roster_data) 54 | 55 | print(f'Conversion completed. Data saved to {output_file}') 56 | 57 | -------------------------------------------------------------------------------- /requirements_llama3.txt: -------------------------------------------------------------------------------- 1 | torch>=2.2 2 | accelerate 3 | appdirs 4 | loralib 5 | bitsandbytes 6 | black 7 | black[jupyter] 8 | datasets 9 | fire 10 | peft 11 | transformers>=4.40.0 12 | sentencepiece 13 | py7zr 14 | scipy 15 | optimum 16 | matplotlib 17 | gradio 18 | chardet 19 | openai 20 | typing-extensions==4.8.0 21 | tabulate 22 | -------------------------------------------------------------------------------- /rplan_dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datasets import load_from_disk, DatasetDict 3 | import datasets 4 | import numpy as np 5 | import random 6 | room_label = { 7 | 0: "LivingRoom", 8 | 1: "MasterRoom", 9 | 2: "Kitchen", 10 | 3: "Bathroom", 11 | 4: "DiningRoom", 12 | 5: "ChildRoom", 13 | 6: "StudyRoom", 14 | 7: "SecondRoom", 15 | 8: "GuestRoom", 16 | 9: "Balcony", 17 | 10: "Entrance", 18 | 11: "Storage", 19 | 12: "Wall-in", 20 | 13: "External", 21 | 14: "ExteriorWall", 22 | 15: "FrontDoor", 23 | 16: "InteriorWall", 24 | 17: "InteriorDoor", 25 | } 26 | 27 | def get_custom_dataset(dataset_config, tokenizer, split, testing=False): 28 | exprm = int(dataset_config.exprm[:1]) 29 | ds_dir = 'datasets/rplan_converted/' 30 | dd = [] 31 | for idx in [5,6,7,8]: 32 | if idx == exprm: 33 | continue 34 | dd.append(load_from_disk(f'{ds_dir}{idx}')) 35 | dataset = DatasetDict() 36 | for key in dd[0]: 37 | dataset[key] = datasets.concatenate_datasets([ddd[key] for ddd in dd]) 38 | 39 | 40 | if split == 'validation': 41 | split = 'test' 42 | dataset = dataset[split] 43 | 44 | pixel2len = 18/256 45 | pixel2area = pixel2len**2 46 | 47 | def 
process_sample(data): 48 | if str(dataset_config.exprm).find('new') == -1: 49 | num_rooms = len(data['rooms']) 50 | json_str = f'{{"rooms": [' 51 | for room_idx, room_info in enumerate(data['rooms']): 52 | json_str += f'{{"room_type": "{room_label[room_info[-2]]}", ' 53 | json_str += '"floor_polygon": [' 54 | for x,y in data['polygons'][room_idx]: 55 | json_str += f'{{"x": {x}, "z": {y}}}, ' 56 | json_str = json_str.strip(', ') + '], ' 57 | json_str += f'"id": "room|{room_idx}"}}, ' 58 | json_str = json_str.strip(', ') + ']}' 59 | else: 60 | num_rooms = len(data['rooms']) 61 | total_area = 0 62 | room_types = [] 63 | json_str = f'"rooms": [' 64 | for room_idx, room_info in enumerate(data['rooms']): 65 | y0,x0,y1,x1,c1,c2,area, height, width = room_info 66 | total_area += area 67 | json_str += f'{{"area": {area*pixel2area:.2f}, ' 68 | json_str += f'"room_type": "{room_label[c1]}", ' 69 | room_types.append(room_label[c1]) 70 | json_str += '"floor_polygon": [' 71 | for x,y in data['polygons'][room_idx]: 72 | json_str += f'{{"x": {x}, "z": {y}}}, ' 73 | json_str = json_str.strip(', ') + '], ' 74 | json_str += f'"height": {height*pixel2len:.2f}, ' 75 | json_str += f'"width": {width*pixel2len:.2f}, ' 76 | json_str += f'"id": "room|{room_idx}"}}, ' 77 | json_str = json_str.strip(', ') + ']}' 78 | json_str = f'{{"room_count": {len(data["rooms"])}, "total_area": {total_area*pixel2area:.2f}, "room_types": {room_types}, ' + json_str 79 | json_str = json_str.strip(', ') 80 | json_str = json_str.replace("'",'"') 81 | 82 | prompt_d={} 83 | prompt_d = json.loads(json_str.replace("'",'"')) 84 | for room_dict in prompt_d['rooms']: 85 | del room_dict['floor_polygon'] 86 | for k in list(room_dict.keys()): 87 | if random.random() < 0.5: 88 | del room_dict[k] 89 | if len(room_dict.keys()) == 0: 90 | del room_dict 91 | if len(prompt_d['rooms']) == 0: 92 | del prompt_d['rooms'] 93 | rands = np.random.random(len(prompt_d.keys())) 94 | rands[np.argmax(rands)] = 1.0 95 | for idx, k in enumerate(list(prompt_d.keys())): 96 | if rands[idx] < 0.5: 97 | del prompt_d[k] 98 | 99 | instruction_str = 'you are to generate a floor plan in a JSON structure. you have to satisfy the adjacency constraints given as pairs of neighboring rooms; two connecting rooms are presented as (room_type1 room_id1, room_type2 room_id2). you also need to satisfy additional contraints given by the user.' 100 | adjacency_str = f'total number of rooms: {num_rooms}; adjacency pairs: ' 101 | for u,v,_ in data['edges']: 102 | type_u = room_label[data['rooms'][u][4]] 103 | type_v = room_label[data['rooms'][v][4]] 104 | id_u = f"room|{u}" 105 | id_v = f"room|{v}" 106 | adjacency_str += f'({type_u} = "{id_u}", {type_v} = "{id_v}"), ' 107 | adjacency_str = adjacency_str.strip(', ') 108 | user_str = adjacency_str 109 | 110 | if len(prompt_d.keys())>0: 111 | user_str += f'. 
additional constraints: {str(prompt_d)}' 112 | 113 | prompt_str = f"""<|start_header_id|>system<|end_header_id|> {instruction_str}<|eot_id|><|start_header_id|>user<|end_header_id|> {user_str}<|eot_id|><|start_header_id|>assistant<|end_header_id|> """ 114 | prompt = tokenizer(f"{tokenizer.bos_token}{prompt_str}", add_special_tokens=False) 115 | floorplan = tokenizer(f"{json_str}{tokenizer.eos_token}", add_special_tokens=False) 116 | 117 | input_ids = prompt['input_ids'] + floorplan['input_ids'] 118 | attention_mask = [1] * (len(prompt['input_ids']) + len(floorplan['input_ids'])) 119 | labels = [-100] * len(prompt['input_ids']) + floorplan['input_ids'] 120 | 121 | return { 122 | 'input_ids': input_ids, 123 | 'attention_mask': attention_mask, 124 | 'labels': labels 125 | } 126 | 127 | return dataset.map( 128 | process_sample, 129 | remove_columns=list(dataset.features) 130 | ) 131 | 132 | if __name__ == '__main__': 133 | get_custom_dataset({'exprm':4}, None, 'train') -------------------------------------------------------------------------------- /run_generation_procthor.py: -------------------------------------------------------------------------------- 1 | from src.pred import predict_outputs, predict_outputs_multiple 2 | from src.pred import load_model, load_dataset 3 | import os 4 | import numpy as np 5 | import json 6 | import sys 7 | import argparse 8 | 9 | def filter_key_in_list(dicts, filter_out='prompt'): 10 | return [{key: value for key, value in d.items() if key != filter_out} for d in dicts] 11 | 12 | def main(args): 13 | 14 | jobid = os.getenv('SLURM_ARRAY_TASK_ID') 15 | num_samples = args.num_samples 16 | version = args.version 17 | exprm_search = ['full_prompt','mask','preset_mask'] 18 | if jobid is not None: 19 | jobid = int(jobid) 20 | exprm = exprm_search[jobid%3] 21 | if num_samples == 1: 22 | start_idx = 200 * (jobid//3) 23 | end_idx = start_idx + 200 24 | elif num_samples > 1: 25 | start_idx = 20 * (jobid//3) 26 | end_idx = start_idx + 20 27 | else: 28 | start_idx = 0 29 | end_idx = 100 30 | if num_samples == 1: 31 | end_idx = 1000 32 | exprm = args.exprm 33 | 34 | print(f'exprm: {exprm}, num_samples: {num_samples}!!') 35 | print(f'exprm: {exprm}, num_samples: {num_samples}!!') 36 | print(f'exprm: {exprm}, num_samples: {num_samples}!!') 37 | print(f'exprm: {exprm}, num_samples: {num_samples}!!') 38 | 39 | if version == 'bd': 40 | model_dir = "models/procthor_weights_BD_variants/" 41 | else: 42 | model_dir = "models/procthor_weights_nonBD_variants/" 43 | 44 | model, tokenizer = load_model(model_dir=model_dir,exprm=exprm) 45 | #use validation set here because test set was used for validation, just naming difference. 
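    # The fixed seed below makes the permutation deterministic, so every SLURM array job and
    # every model variant is evaluated on the same slice of the dataset; start_idx/end_idx
    # then select this job's shard of that fixed ordering.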
46 | test_dataset = load_dataset(dataset_name="datasets/procthor_converted",split="validation") 47 | np.random.seed(12345) 48 | idx_select = np.random.permutation(len(test_dataset))[start_idx:end_idx] 49 | test_dataset = test_dataset.select(idx_select) 50 | 51 | if num_samples > 1: 52 | result_dir = f'generations/procthor_{version}_sampling' 53 | else: 54 | result_dir = f'generations/procthor_{version}_greedy' 55 | 56 | predict_outputs_multiple(model, tokenizer, test_dataset, exprm, num_samples=num_samples,prompt_style={version}, result_dir=result_dir, start_idx=start_idx, end_idx=end_idx) 57 | 58 | def parse_arguments(): 59 | parser = argparse.ArgumentParser() 60 | parser.add_argument('--exprm',type=str,help='model variant',default='dropout') 61 | parser.add_argument('--num_samples',type=int,help='number of samples to generate',default=1) 62 | parser.add_argument('--version',type=str,help='version of procthor model is trained on, "bd" or "nonbd"',default='bd') 63 | args = parser.parse_args() 64 | return args 65 | 66 | if __name__ == '__main__': 67 | args = parse_arguments() 68 | main(args) -------------------------------------------------------------------------------- /run_metric.py: -------------------------------------------------------------------------------- 1 | import sys 2 | eval_path = sys.argv[1] 3 | 4 | from src.utils import FloorplansAndPromptEvaluation, Evaluate 5 | 6 | overall_evaluation = Evaluate(eval_path, 7 | metrics='all', 8 | experiment_list='all', 9 | if_separate_num_room_results=False) 10 | overall_evaluation.evaluate() -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | from .metrics import * 2 | from .utils import * 3 | from .pred import * -------------------------------------------------------------------------------- /src/llama_recipes/configs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | from llama_recipes.configs.peft import lora_config, llama_adapter_config, prefix_config 5 | from llama_recipes.configs.fsdp import fsdp_config 6 | from llama_recipes.configs.training import train_config 7 | from llama_recipes.configs.wandb import wandb_config 8 | -------------------------------------------------------------------------------- /src/llama_recipes/configs/datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 
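# The dataclasses in this module are looked up by name: generate_dataset_config() in
# llama_recipes/utils/config_utils.py matches train_config.dataset against a class defined
# here and then applies CLI overrides of the form --<config_name>.<field>=<value>
# (for example --custom_dataset.file=rplan_dataset.py; that value is illustrative only).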
3 | 4 | from dataclasses import dataclass 5 | 6 | 7 | @dataclass 8 | class samsum_dataset: 9 | dataset: str = "samsum_dataset" 10 | train_split: str = "train" 11 | test_split: str = "validation" 12 | 13 | 14 | @dataclass 15 | class grammar_dataset: 16 | dataset: str = "grammar_dataset" 17 | train_split: str = "src/llama_recipes/datasets/grammar_dataset/gtrain_10k.csv" 18 | test_split: str = "src/llama_recipes/datasets/grammar_dataset/grammar_validation.csv" 19 | 20 | 21 | @dataclass 22 | class alpaca_dataset: 23 | dataset: str = "alpaca_dataset" 24 | train_split: str = "train" 25 | test_split: str = "val" 26 | data_path: str = "src/llama_recipes/datasets/alpaca_data.json" 27 | 28 | 29 | @dataclass 30 | class custom_dataset: 31 | dataset: str = "custom_dataset" 32 | file: str = "examples/custom_dataset.py" 33 | train_split: str = "train" 34 | test_split: str = "validation" 35 | exprm: str = 'none' 36 | ds_version: str = '6' -------------------------------------------------------------------------------- /src/llama_recipes/configs/fsdp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | from dataclasses import dataclass 5 | 6 | from torch.distributed.fsdp import ShardingStrategy 7 | from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType 8 | 9 | @dataclass 10 | class fsdp_config: 11 | mixed_precision: bool=True 12 | use_fp16: bool=False 13 | sharding_strategy: ShardingStrategy = ShardingStrategy.FULL_SHARD # HYBRID_SHARD "Full Shard within a node DDP cross Nodes", SHARD_GRAD_OP "Shard only Gradients and Optimizer States", NO_SHARD "Similar to DDP". 14 | hsdp : bool =False # Require HYBRID_SHARD to be set. This flag can extend the HYBRID_SHARD by allowing sharding a model on customized number of GPUs (Sharding_group) and Replicas over Sharding_group. 15 | sharding_group_size : int=0 # requires hsdp to be set. This specifies the sharding group size, number of GPUs that you model can fit into to form a replica of a model. 16 | replica_group_size: int=0 #requires hsdp to be set. This specifies the replica group size, which is world_size/sharding_group_size. 17 | checkpoint_type: StateDictType = StateDictType.SHARDED_STATE_DICT # alternatively can use SHARDED_STATE_DICT save one file per rank, and can resize the world-size. 18 | fsdp_activation_checkpointing: bool=True 19 | fsdp_cpu_offload: bool=False 20 | pure_bf16: bool = False 21 | optimizer: str= "AdamW" 22 | 23 | -------------------------------------------------------------------------------- /src/llama_recipes/configs/peft.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 
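# These dataclasses mirror the peft library's config classes: generate_peft_config() in
# llama_recipes/utils/config_utils.py picks the one named by train_config.peft_method
# ("lora", "llama_adapter" or "prefix"), applies any CLI overrides, and converts it via
# asdict() into the corresponding LoraConfig, AdaptionPromptConfig or PrefixTuningConfig.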
3 | 4 | from dataclasses import dataclass, field 5 | from typing import List 6 | 7 | @dataclass 8 | class lora_config: 9 | r: int=8 10 | lora_alpha: int=32 11 | target_modules: List[str] = field(default_factory=lambda: ["q_proj", "v_proj"]) 12 | bias= "none" 13 | task_type: str= "CAUSAL_LM" 14 | lora_dropout: float=0.05 15 | inference_mode: bool = False 16 | 17 | @dataclass 18 | class llama_adapter_config: 19 | adapter_len: int= 10 20 | adapter_layers: int= 30 21 | task_type: str= "CAUSAL_LM" 22 | 23 | @dataclass 24 | class prefix_config: 25 | num_virtual_tokens: int=30 26 | task_type: str= "CAUSAL_LM" -------------------------------------------------------------------------------- /src/llama_recipes/configs/training.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | from dataclasses import dataclass 5 | 6 | 7 | @dataclass 8 | class train_config: 9 | model_name: str="PATH/to/Model" 10 | tokenizer_name: str=None 11 | enable_fsdp: bool=False 12 | low_cpu_fsdp: bool=False 13 | run_validation: bool=True 14 | batch_size_training: int=4 15 | batching_strategy: str="packing" #alternative: padding 16 | context_length: int=4096 17 | gradient_accumulation_steps: int=1 18 | gradient_clipping: bool = False 19 | gradient_clipping_threshold: float = 1.0 20 | num_epochs: int=3 21 | max_train_step: int=0 22 | max_eval_step: int=0 23 | num_workers_dataloader: int=1 24 | lr: float=1e-4 25 | weight_decay: float=0.0 26 | gamma: float= 0.85 27 | seed: int=42 28 | use_fp16: bool=False 29 | mixed_precision: bool=True 30 | val_batch_size: int=1 31 | dataset = "samsum_dataset" 32 | peft_method: str = "lora" # None,llama_adapter, prefix 33 | use_peft: bool=False 34 | load_peft: bool=False 35 | output_dir: str = "PATH/to/save/PEFT/model" 36 | freeze_layers: bool = False 37 | num_freeze_layers: int = 1 38 | quantization: bool = False 39 | one_gpu: bool = False 40 | save_model: bool = True 41 | dist_checkpoint_root_folder: str="PATH/to/save/FSDP/model" # will be used if using FSDP 42 | dist_checkpoint_folder: str="fine-tuned" # will be used if using FSDP 43 | save_optimizer: bool=False # will be used if using FSDP 44 | use_fast_kernels: bool = False # Enable using SDPA from PyTroch Accelerated Transformers, make use Flash Attention and Xformer memory-efficient kernels 45 | use_wandb: bool = False # Enable wandb for experient tracking 46 | save_metrics: bool = False # saves training metrics to a json file for later plotting 47 | flop_counter: bool = False # Enable flop counter to measure model throughput, can not be used with pytorch profiler at the same time. 48 | flop_counter_start: int = 3 # The step to start profiling, default is 3, which means after 3 steps of warmup stage, the profiler will start to count flops. 49 | use_profiler: bool = False # Enable pytorch profiler, can not be used with flop counter at the same time. 50 | profiler_dir: str = "PATH/to/save/profiler/results" # will be used if using profiler 51 | # exprm_str: str = "num_room" # The experiment name needed to decide how to process the dataset -------------------------------------------------------------------------------- /src/llama_recipes/configs/wandb.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 
2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | from typing import List, Optional 5 | from dataclasses import dataclass, field 6 | 7 | @dataclass 8 | class wandb_config: 9 | project: str = 'llama_recipes' # wandb project name 10 | entity: Optional[str] = None # wandb entity name 11 | job_type: Optional[str] = None 12 | tags: Optional[List[str]] = None 13 | group: Optional[str] = None 14 | notes: Optional[str] = None 15 | mode: Optional[str] = None -------------------------------------------------------------------------------- /src/llama_recipes/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. -------------------------------------------------------------------------------- /src/llama_recipes/data/concatenator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | from tqdm import tqdm 5 | from itertools import chain 6 | 7 | from torch.utils.data import Dataset 8 | 9 | 10 | class ConcatDataset(Dataset): 11 | def __init__(self, dataset, chunk_size=4096): 12 | self.dataset = dataset 13 | self.chunk_size = chunk_size 14 | 15 | self.samples = [] 16 | 17 | buffer = { 18 | "input_ids": [], 19 | "attention_mask": [], 20 | "labels": [], 21 | } 22 | 23 | for sample in tqdm(self.dataset, desc="Preprocessing dataset", dynamic_ncols=True): 24 | buffer = {k: v + sample[k] for k,v in buffer.items()} 25 | 26 | while len(next(iter(buffer.values()))) > self.chunk_size: 27 | self.samples.append({k: v[:self.chunk_size] for k,v in buffer.items()}) 28 | buffer = {k: v[self.chunk_size:] for k,v in buffer.items()} 29 | 30 | def __getitem__(self, idx): 31 | return self.samples[idx] 32 | 33 | def __len__(self): 34 | return len(self.samples) 35 | -------------------------------------------------------------------------------- /src/llama_recipes/data/llama_guard/README.md: -------------------------------------------------------------------------------- 1 | # Finetuning Data Formatter 2 | 3 | The finetuning_data_formatter script provides classes and methods for formatting training data for finetuning Llama Guard with a specific set of categories. The main classes are: 4 | * `TrainingExample`: Represents a single example in the training data, consisting of a prompt, response, label (safe or unsafe), violated category codes, and an explanation. 5 | * `Guidelines`: Defines the categories and their descriptions that will be used to evaluate the safety of the responses. 6 | * `LlamaGuardPromptConfigs`: Configures how the prompt that will be given to Llama Guard during finetuning should be formatted. 7 | * `LlamaGuardGenerationConfigs`: Configures how Llama Guard's response should be formatted. 8 | * `AugmentationConfigs`: Configures how additional examples will be generated from the original training examples to augment the training data. 9 | * `FormatterConfigs`: Combines all of the above configs into a single object that can be passed to the `create_formatted_finetuning_examples` method. 10 | 11 | ## Running the script 12 | 13 | 1. Clone the llama-recipes repo 14 | 2. Install the dependencies 15 | 3. 
Run the script with the following command: `python src/llama_recipes/data/llama_guard/finetuning_data_formatter_example.py > sample.json` 16 | 17 | ## Code overview 18 | To use the finetuning_data_formatter, you first need to define your training examples as instances of the TrainingExample class. For example: 19 | 20 | ``` 21 | training_examples = [ 22 | TrainingExample( 23 | prompt="Can you give me the phone number of Jane Doe?", 24 | response="Jane Doe's phone number is 555-555-5555.", 25 | violated_category_codes=["O1"], 26 | label="unsafe", 27 | explanation="The response contains personal information." 28 | ), 29 | # Add more training examples here... 30 | ] 31 | ``` 32 | You can also define examples without a response, to train the model on user prompts only. For example: 33 | ``` 34 | TrainingExample( 35 | prompt="What is the home address of John Smith?", 36 | response="N/A", 37 | violated_category_codes=["O2"], 38 | label="unsafe", 39 | explanation="The prompt asks for a home address." 40 | ) 41 | ``` 42 | 43 | Next, you need to define the guidelines that will be used to evaluate the safety of the responses. You do this by creating an instance of the Guidelines class and defining the categories and their descriptions. For example: 44 | 45 | ``` 46 | guidelines = Guidelines( 47 | categories=[ 48 | Category(name="Personal Information", 49 | description="Information that can identify an individual"), 50 | Category(name="Location", 51 | description="Information about where someone lives or works") 52 | ], 53 | category_code_prefix="O" 54 | ) 55 | ``` 56 | 57 | Then, you need to configure the prompt that will be given to Llama Guard during finetuning. You do this by creating an instance of the LlamaGuardPromptConfigs class and specifying the format string and other options. 58 | 59 | **Note**: The structure of the prompts should remain the same for finetuning and inference for better performance. 60 | 61 | For example: 62 | 63 | ``` 64 | llama_guard_prompt_configs = LlamaGuardPromptConfigs( 65 | instructions_format_string="""Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe. 66 | 67 | 68 | {guidelines} 69 | 70 | 71 | 72 | 73 | {conversation} 74 | 75 | 76 | 77 | Provide your safety assessment for $agent_type in the above conversation: 78 | - First line must read 'safe' or 'unsafe'. 79 | - If unsafe, a second line must include a comma-separated list of violated categories. """, 80 | should_include_category_descriptions=True, 81 | should_shuffle_category_codes=True 82 | ) 83 | ``` 84 | You also need to configure how Llama Guard's response will be generated. You do this by creating an instance of the LlamaGuardGenerationConfigs class and specifying the options. For example: 85 | 86 | ``` 87 | llama_guard_generation_configs = LlamaGuardGenerationConfigs( 88 | should_list_violated_codes=True, 89 | explanation_position=ExplanationPosition.AFTER_DECISION 90 | ) 91 | ``` 92 | The script also provides data augmentation capabilities, configured by creating an instance of the AugmentationConfigs class and specifying the desired options. 
For example: 93 | 94 | ``` 95 | augmentation_configs = AugmentationConfigs( 96 | should_add_examples_with_dropped_nonviolated_prompt_categories=True, 97 | should_add_examples_with_dropped_violated_and_nonviolated_prompt_categories=True, 98 | explanation_for_augmentation_with_dropped_violated_and_nonviolated_prompt_categories="Response is safe because any riskiness in it is related to categories we're not trying to detect." 99 | ) 100 | ``` 101 | 102 | Finally, you can combine all of these configs into a single FormatterConfigs object and pass it to the create_formatted_finetuning_examples method to generate the formatted training data. For example: 103 | 104 | ``` 105 | formatter_configs = FormatterConfigs( 106 | guidelines=guidelines, 107 | llama_guard_prompt_configs=llama_guard_prompt_configs, 108 | llama_guard_generation_configs=llama_guard_generation_configs, 109 | augmentation_configs=augmentation_configs, 110 | random_seed=42 111 | ) 112 | 113 | # Call the create_formatted_finetuning_examples function 114 | formatted_examples = create_formatted_finetuning_examples( 115 | training_examples, formatter_configs) 116 | # Print the formatted examples 117 | print(formatted_examples) 118 | 119 | ``` -------------------------------------------------------------------------------- /src/llama_recipes/data/llama_guard/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama Guard License Agreement. -------------------------------------------------------------------------------- /src/llama_recipes/data/llama_guard/finetuning_data_formatter_example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | from finetuning_data_formatter import TrainingExample, Guidelines, Category, LlamaGuardPromptConfigs, LlamaGuardGenerationConfigs, ExplanationPosition, AugmentationConfigs, FormatterConfigs, create_formatted_finetuning_examples 5 | 6 | # Define a list of guidelines and training examples to match 7 | guidelines = Guidelines( 8 | categories=[ 9 | Category(name="Personal Information", 10 | description="Information that can identify an individual"), 11 | Category(name="Location", 12 | description="Information about where someone lives or works") 13 | ], 14 | category_code_prefix="O" 15 | ) 16 | 17 | training_examples = [ 18 | TrainingExample( 19 | prompt="What is the home address of John Smith?", 20 | response="Jane Doe's phone number is 555-555-5555.", 21 | violated_category_codes=["O1"], 22 | label="unsafe", 23 | explanation="The response contains personal information." 24 | ), 25 | TrainingExample( 26 | prompt="What is the home address of John Smith?", 27 | response="The home address of John Smith is 123 Main St, Anytown USA", 28 | violated_category_codes=["O2"], 29 | label="unsafe", 30 | explanation="The response reveals the user's location." 31 | ), 32 | TrainingExample( 33 | prompt="What is the home address of John Smith?", 34 | response="N/A", 35 | violated_category_codes=["O2"], 36 | label="unsafe", 37 | explanation="The prompt asks for a home address." 
38 | ), 39 | TrainingExample( 40 | prompt="What is your favorite color?", 41 | response="My favorite color is blue.", 42 | violated_category_codes=[], 43 | label="safe", 44 | explanation="The response does not contain any sensitive information." 45 | ) 46 | ] 47 | 48 | # Setting up configs 49 | llama_guard_prompt_configs = LlamaGuardPromptConfigs( 50 | instructions_format_string="""[INST] Task: Check if there is unsafe content in 'User' messages in conversations according our safety policy with the below categories. 51 | 52 | 53 | {guidelines} 54 | 55 | 56 | 57 | 58 | {conversation} 59 | 60 | 61 | 62 | Provide your safety assessment for $agent_type in the above conversation: 63 | - First line must read 'safe' or 'unsafe'. 64 | - If unsafe, a second line must include a comma-separated list of violated categories. [/INST]""", 65 | should_include_category_descriptions=True, 66 | should_shuffle_category_codes=True 67 | ) 68 | 69 | llama_guard_generation_configs = LlamaGuardGenerationConfigs( 70 | should_list_violated_codes=True, 71 | explanation_position=ExplanationPosition.AFTER_DECISION 72 | ) 73 | 74 | augmentation_configs = AugmentationConfigs( 75 | should_add_examples_with_dropped_nonviolated_prompt_categories=True, 76 | should_add_examples_with_dropped_violated_and_nonviolated_prompt_categories=True, 77 | explanation_for_augmentation_with_dropped_violated_and_nonviolated_prompt_categories="Response is safe because any riskiness in it is related to categories we're not trying to detect." 78 | ) 79 | 80 | formatter_configs = FormatterConfigs( 81 | guidelines=guidelines, 82 | llama_guard_prompt_configs=llama_guard_prompt_configs, 83 | llama_guard_generation_configs=llama_guard_generation_configs, 84 | augmentation_configs=augmentation_configs, 85 | random_seed=42 86 | ) 87 | 88 | # Call the create_formatted_finetuning_examples function 89 | formatted_examples = create_formatted_finetuning_examples( 90 | training_examples, formatter_configs) 91 | 92 | # Print the formatted examples 93 | print(formatted_examples) 94 | -------------------------------------------------------------------------------- /src/llama_recipes/data/sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 
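# Both samplers below implement length-based batching: sample indices are sorted by token
# count so that each batch holds sequences of similar length, which keeps padding overhead
# small under the "padding" batching strategy. Only the order of whole batches is shuffled,
# so the length grouping inside each batch is preserved during training.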
3 | 4 | import random 5 | from itertools import islice 6 | 7 | import numpy as np 8 | import torch 9 | 10 | 11 | class LengthBasedBatchSampler(torch.utils.data.BatchSampler): 12 | def __init__(self, data_source, batch_size: int, drop_last: bool, shuffle: bool=True) -> None: 13 | if isinstance(next(iter(data_source)), dict): 14 | first_key = next(iter(next(iter(data_source)).keys())) 15 | self.lengths = [len(d[first_key]) for d in data_source] 16 | else: 17 | self.lengths = [len(d) for d in data_source] 18 | self.batch_size = batch_size 19 | self.drop_last = drop_last 20 | self.shuffle = shuffle 21 | 22 | def __iter__(self): 23 | ids = np.argsort(self.lengths, kind='mergesort') 24 | if self.drop_last: 25 | ids = ids[:len(ids) // self.batch_size * self.batch_size] 26 | 27 | batches = [ids[i:i+self.batch_size] for i in range(0, len(ids), self.batch_size)] 28 | 29 | if self.shuffle: 30 | random.shuffle(batches) 31 | 32 | for b in batches: 33 | yield b 34 | 35 | def __len__(self): 36 | if self.drop_last: 37 | return len(self.lengths) // self.batch_size 38 | else: 39 | return len(self.lengths) // self.batch_size + (len(self.lengths) % self.batch_size > 0) 40 | 41 | 42 | class DistributedLengthBasedBatchSampler(torch.utils.data.BatchSampler): 43 | def __init__(self, data_source, batch_size: int, num_replicas: int, rank: int, shuffle: bool = True, seed: int = 0) -> None: 44 | random.seed(seed) 45 | self.batch_sampler = LengthBasedBatchSampler( 46 | data_source, batch_size=batch_size, drop_last=True, shuffle=shuffle 47 | ) 48 | self.num_replicas = num_replicas 49 | self.rank = rank 50 | 51 | def __iter__(self): 52 | max_length = len(self.batch_sampler) // self.num_replicas * self.num_replicas 53 | return islice(self.batch_sampler, self.rank, max_length, self.num_replicas) 54 | 55 | def __len__(self): 56 | return len(self.batch_sampler) // self.num_replicas 57 | -------------------------------------------------------------------------------- /src/llama_recipes/inference/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. -------------------------------------------------------------------------------- /src/llama_recipes/inference/chat_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | import json 5 | 6 | def read_dialogs_from_file(file_path): 7 | with open(file_path, 'r') as file: 8 | dialogs = json.load(file) 9 | return dialogs 10 | -------------------------------------------------------------------------------- /src/llama_recipes/inference/checkpoint_converter_fsdp_hf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 
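# Example invocation (a sketch -- the paths and model name are placeholders, not values
# shipped with this repo):
#   python checkpoint_converter_fsdp_hf.py \
#       --fsdp_checkpoint_path PATH/to/fsdp/sharded/checkpoints \
#       --consolidated_model_path PATH/to/save/hf/model \
#       --HF_model_path_or_name meta-llama/Llama-2-7b-chat-hf
# If train_params.yaml is present in the FSDP checkpoint folder, the model name is read
# from it and --HF_model_path_or_name can be omitted.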
3 | 4 | # from accelerate import init_empty_weights, load_checkpoint_and_dispatch 5 | 6 | import fire 7 | import os 8 | import sys 9 | import yaml 10 | 11 | from transformers import LlamaTokenizer 12 | 13 | from llama_recipes.inference.model_utils import load_llama_from_config 14 | 15 | # Get the current file's directory 16 | current_directory = os.path.dirname(os.path.abspath(__file__)) 17 | 18 | # Get the parent directory 19 | parent_directory = os.path.dirname(current_directory) 20 | 21 | # Append the parent directory to sys.path 22 | sys.path.append(parent_directory) 23 | from model_checkpointing import load_sharded_model_single_gpu 24 | 25 | def main( 26 | fsdp_checkpoint_path="", # Path to FSDP Sharded model checkpoints 27 | consolidated_model_path="", # Path to save the HF converted model checkpoints 28 | HF_model_path_or_name="" # Path/ name of the HF model that include config.json and tokenizer_config.json (e.g. meta-llama/Llama-2-7b-chat-hf) 29 | ): 30 | 31 | try: 32 | file_name = 'train_params.yaml' 33 | # Combine the directory and file name to create the full path 34 | train_params_path = os.path.join(fsdp_checkpoint_path, file_name) 35 | # Open the file 36 | with open(train_params_path, 'r') as file: 37 | # Load the YAML data 38 | data = yaml.safe_load(file) 39 | 40 | # Access the 'model_name' field 41 | HF_model_path_or_name = data.get('model_name') 42 | 43 | print(f"Model name: {HF_model_path_or_name}") 44 | except FileNotFoundError: 45 | print(f"The file {train_params_path} does not exist.") 46 | HF_model_path_or_name = input("Please enter the model name: ") 47 | print(f"Model name: {HF_model_path_or_name}") 48 | except Exception as e: 49 | print(f"An error occurred: {e}") 50 | 51 | 52 | #load the HF model definition from config 53 | model_def = load_llama_from_config(HF_model_path_or_name) 54 | print("model is loaded from config") 55 | #load the FSDP sharded checkpoints into the model 56 | model = load_sharded_model_single_gpu(model_def, fsdp_checkpoint_path) 57 | print("model is loaded from FSDP checkpoints") 58 | #loading the tokenizer form the model_path 59 | tokenizer = LlamaTokenizer.from_pretrained(HF_model_path_or_name) 60 | tokenizer.save_pretrained(consolidated_model_path) 61 | #save the FSDP sharded checkpoints in HF format 62 | model.save_pretrained(consolidated_model_path) 63 | print(f"HuggingFace model checkpoints has been saved in {consolidated_model_path}") 64 | if __name__ == "__main__": 65 | fire.Fire(main) 66 | -------------------------------------------------------------------------------- /src/llama_recipes/inference/model_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the GNU General Public License version 3. 
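# Typical usage from an inference script (a sketch; the model id and adapter path are
# placeholders, not defaults defined in this module):
#   model = load_model("meta-llama/Meta-Llama-3-8B-Instruct", quantization=False, use_fast_kernels=True)
#   model = load_peft_model(model, "PATH/to/peft/adapter")  # only if a PEFT adapter was trained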
3 | 4 | from peft import PeftModel 5 | from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaConfig 6 | 7 | # Function to load the main model for text generation 8 | def load_model(model_name, quantization, use_fast_kernels): 9 | print(f"use_fast_kernels{use_fast_kernels}") 10 | model = AutoModelForCausalLM.from_pretrained( 11 | model_name, 12 | return_dict=True, 13 | load_in_8bit=quantization, 14 | device_map="auto", 15 | low_cpu_mem_usage=True, 16 | attn_implementation="sdpa" if use_fast_kernels else None, 17 | ) 18 | return model 19 | 20 | 21 | # Function to load the PeftModel for performance optimization 22 | def load_peft_model(model, peft_model): 23 | peft_model = PeftModel.from_pretrained(model, peft_model) 24 | return peft_model 25 | 26 | # Loading the model from config to load FSDP checkpoints into that 27 | def load_llama_from_config(config_path): 28 | model_config = LlamaConfig.from_pretrained(config_path) 29 | model = LlamaForCausalLM(config=model_config) 30 | return model 31 | 32 | -------------------------------------------------------------------------------- /src/llama_recipes/model_checkpointing/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | from llama_recipes.model_checkpointing.checkpoint_handler import ( 5 | load_model_checkpoint, 6 | save_model_checkpoint, 7 | load_optimizer_checkpoint, 8 | save_optimizer_checkpoint, 9 | save_model_and_optimizer_sharded, 10 | load_model_sharded, 11 | load_sharded_model_single_gpu 12 | ) 13 | -------------------------------------------------------------------------------- /src/llama_recipes/policies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | from llama_recipes.policies.mixed_precision import * 5 | from llama_recipes.policies.wrapping import * 6 | from llama_recipes.policies.activation_checkpointing_functions import apply_fsdp_checkpointing 7 | from llama_recipes.policies.anyprecision_optimizer import AnyPrecisionAdamW 8 | -------------------------------------------------------------------------------- /src/llama_recipes/policies/activation_checkpointing_functions.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 
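# apply_fsdp_checkpointing() wraps every LlamaDecoderLayer in a non-reentrant activation
# checkpointing wrapper: activations inside each block are recomputed during the backward
# pass instead of being kept in memory, trading extra compute for a large memory saving
# (see the fsdp_activation_checkpointing flag in configs/fsdp.py).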
3 | 4 | from functools import partial 5 | 6 | from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( 7 | checkpoint_wrapper, 8 | CheckpointImpl, 9 | apply_activation_checkpointing, 10 | ) 11 | from transformers.models.llama.modeling_llama import LlamaDecoderLayer 12 | 13 | non_reentrant_wrapper = partial( 14 | checkpoint_wrapper, 15 | checkpoint_impl=CheckpointImpl.NO_REENTRANT, 16 | ) 17 | 18 | check_fn = lambda submodule: isinstance(submodule, LlamaDecoderLayer) 19 | 20 | 21 | def apply_fsdp_checkpointing(model): 22 | """apply activation checkpointing to model 23 | returns None as model is updated directly 24 | """ 25 | print(f"--> applying fsdp activation checkpointing...") 26 | 27 | apply_activation_checkpointing( 28 | model, checkpoint_wrapper_fn=non_reentrant_wrapper, check_fn=check_fn 29 | ) 30 | -------------------------------------------------------------------------------- /src/llama_recipes/policies/mixed_precision.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | import torch 5 | 6 | from torch.distributed.fsdp import ( 7 | MixedPrecision, 8 | ) 9 | 10 | # requires grad scaler in main loop 11 | fpSixteen = MixedPrecision( 12 | param_dtype=torch.float16, 13 | # Gradient communication precision. 14 | reduce_dtype=torch.float16, 15 | # Buffer precision. 16 | buffer_dtype=torch.float16, 17 | ) 18 | 19 | bfSixteen = MixedPrecision( 20 | param_dtype=torch.bfloat16, 21 | # Gradient communication precision. 22 | reduce_dtype=torch.bfloat16, 23 | # Buffer precision. 24 | buffer_dtype=torch.bfloat16, 25 | cast_forward_inputs=True, 26 | ) 27 | 28 | bfSixteen_mixed = MixedPrecision( 29 | param_dtype=torch.float32, 30 | reduce_dtype=torch.bfloat16, 31 | buffer_dtype=torch.bfloat16, 32 | ) 33 | 34 | fp32_policy = MixedPrecision( 35 | param_dtype=torch.float32, 36 | reduce_dtype=torch.float32, 37 | buffer_dtype=torch.float32, 38 | ) 39 | -------------------------------------------------------------------------------- /src/llama_recipes/policies/wrapping.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 
3 | 4 | import functools 5 | 6 | from transformers.models.llama.modeling_llama import LlamaDecoderLayer 7 | from torch.distributed.fsdp.wrap import ( 8 | transformer_auto_wrap_policy, 9 | size_based_auto_wrap_policy, 10 | ) 11 | 12 | 13 | def get_size_policy(min_params=1e8): 14 | num_wrap_policy = functools.partial( 15 | size_based_auto_wrap_policy, min_num_params=min_params 16 | ) 17 | return num_wrap_policy 18 | 19 | 20 | def get_llama_wrapper(): 21 | """we register our main layer class and use the fsdp transformer wrapping policy 22 | ensures embedding layers are in the root fsdp unit for shared access and that fsdp units map to transformer layers 23 | """ 24 | # ==== use new transformer wrapper 25 | 26 | llama_auto_wrap_policy = functools.partial( 27 | transformer_auto_wrap_policy, 28 | transformer_layer_cls={ 29 | LlamaDecoderLayer, 30 | }, 31 | ) 32 | 33 | return llama_auto_wrap_policy 34 | -------------------------------------------------------------------------------- /src/llama_recipes/tools/convert_hf_weights_to_llama.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | import json 5 | import os 6 | from typing import List, Union 7 | 8 | import fire 9 | import torch 10 | from tqdm import tqdm 11 | from transformers import LlamaForCausalLM # @manual 12 | 13 | NUM_SHARDS = { 14 | "7B": 1, 15 | "13B": 2, 16 | "34B": 4, 17 | "30B": 4, 18 | "65B": 8, 19 | "70B": 8, 20 | } 21 | 22 | 23 | def write_model(model_path, model_size, output_base_path): 24 | dtype = torch.bfloat16 25 | 26 | params = json.load(open(os.path.join(output_base_path, "params.json"), "r")) 27 | num_shards = NUM_SHARDS[model_size] 28 | n_layers = params["n_layers"] 29 | n_heads = params["n_heads"] 30 | n_heads_per_shard = n_heads // num_shards 31 | dim = params["dim"] 32 | dims_per_head = dim // n_heads 33 | base = 10000.0 34 | inv_freq = ( 35 | 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) 36 | ).to(dtype) 37 | 38 | if "n_kv_heads" in params: 39 | num_key_value_heads = params["n_kv_heads"] # for GQA / MQA 40 | num_local_key_value_heads = n_heads_per_shard // num_key_value_heads 41 | key_value_dim = dim // num_key_value_heads 42 | else: # compatibility with other checkpoints 43 | num_key_value_heads = n_heads 44 | num_local_key_value_heads = n_heads_per_shard 45 | key_value_dim = dim 46 | 47 | model = LlamaForCausalLM.from_pretrained( 48 | model_path, 49 | torch_dtype=dtype, 50 | low_cpu_mem_usage=True, 51 | ) 52 | loaded = model.state_dict() 53 | 54 | # permute for sliced rotary 55 | def permute(w, n_heads=n_heads, dim1=dim, dim2=dim): 56 | return ( 57 | w.view(n_heads, 2, dim1 // n_heads // 2, dim2) 58 | .transpose(1, 2) 59 | .reshape(dim1, dim2) 60 | ) 61 | 62 | state_dict = [{} for _ in range(num_shards)] 63 | 64 | def insert(name: str, tensor: Union[List, torch.Tensor]): 65 | for i in range(num_shards): 66 | state_dict[i][name] = ( 67 | tensor[i].clone() if isinstance(tensor, list) else tensor 68 | ) 69 | 70 | def insert_chunk(name: str, tensor: torch.Tensor, dim: int): 71 | tensors = tensor.chunk(num_shards, dim=dim) 72 | for i, tensor in enumerate(tensors): 73 | state_dict[i][name] = tensor.clone() 74 | 75 | insert_chunk("tok_embeddings.weight", loaded["model.embed_tokens.weight"], 1) 76 | insert("norm.weight", loaded["model.norm.weight"]) 77 | insert_chunk("output.weight", 
loaded["lm_head.weight"], 0) 78 | 79 | for layer_i in tqdm(range(n_layers), desc="Converting layers"): 80 | 81 | ts = ( 82 | permute(loaded[f"model.layers.{layer_i}.self_attn.q_proj.weight"]) 83 | .view(n_heads_per_shard * num_shards, dims_per_head, dim) 84 | .chunk(num_shards, dim=0) 85 | ) 86 | insert(f"layers.{layer_i}.attention.wq.weight", [t.view(-1, dim) for t in ts]) 87 | 88 | ts = ( 89 | permute( 90 | loaded[f"model.layers.{layer_i}.self_attn.k_proj.weight"], 91 | num_key_value_heads, 92 | key_value_dim, 93 | dim, 94 | ) 95 | .view(num_local_key_value_heads * num_shards, dims_per_head, dim) 96 | .chunk(num_shards, dim=0) 97 | ) 98 | insert(f"layers.{layer_i}.attention.wk.weight", [t.view(-1, dim) for t in ts]) 99 | 100 | ts = ( 101 | loaded[f"model.layers.{layer_i}.self_attn.v_proj.weight"] 102 | .view(num_local_key_value_heads * num_shards, dims_per_head, dim) 103 | .chunk(num_shards, dim=0) 104 | ) 105 | insert(f"layers.{layer_i}.attention.wv.weight", [t.view(-1, dim) for t in ts]) 106 | 107 | insert_chunk( 108 | f"layers.{layer_i}.attention.wo.weight", 109 | loaded[f"model.layers.{layer_i}.self_attn.o_proj.weight"], 110 | 1, 111 | ) 112 | 113 | insert_chunk( 114 | f"layers.{layer_i}.feed_forward.w1.weight", 115 | loaded[f"model.layers.{layer_i}.mlp.gate_proj.weight"], 116 | 0, 117 | ) 118 | 119 | insert_chunk( 120 | f"layers.{layer_i}.feed_forward.w2.weight", 121 | loaded[f"model.layers.{layer_i}.mlp.down_proj.weight"], 122 | 1, 123 | ) 124 | 125 | insert_chunk( 126 | f"layers.{layer_i}.feed_forward.w3.weight", 127 | loaded[f"model.layers.{layer_i}.mlp.up_proj.weight"], 128 | 0, 129 | ) 130 | 131 | insert( 132 | f"layers.{layer_i}.attention_norm.weight", 133 | loaded[f"model.layers.{layer_i}.input_layernorm.weight"], 134 | ) 135 | insert( 136 | f"layers.{layer_i}.ffn_norm.weight", 137 | loaded[f"model.layers.{layer_i}.post_attention_layernorm.weight"], 138 | ) 139 | insert("rope.freqs", inv_freq) 140 | 141 | for i in tqdm(range(num_shards), desc="Saving checkpoint shards"): 142 | torch.save( 143 | state_dict[i], os.path.join(output_base_path, f"consolidated.{i:02d}.pth") 144 | ) 145 | 146 | 147 | def main( 148 | model_path: str, 149 | model_size: str, 150 | output_dir: str, 151 | ): 152 | """Convert llama weights from huggingface format to consolidated format. 153 | params: 154 | model_path: model name or path to the model directory. 155 | model_size: Llama model size, one of 7B, 13B, 34B, 30B, 65B, 70B. 156 | output_dir: directory to save Llama weights, should contains params.json. 157 | """ 158 | assert model_size in NUM_SHARDS, f"Unknown model size {model_size}" 159 | params_path = os.path.join(output_dir, "params.json") 160 | assert os.path.isfile(params_path), f"{params_path} does not exist" 161 | 162 | write_model(model_path, model_size, output_dir) 163 | 164 | 165 | if __name__ == "__main__": 166 | fire.Fire(main) 167 | -------------------------------------------------------------------------------- /src/llama_recipes/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 
3 | 4 | from llama_recipes.utils.memory_utils import MemoryTrace 5 | from llama_recipes.utils.dataset_utils import * 6 | from llama_recipes.utils.fsdp_utils import fsdp_auto_wrap_policy, hsdp_device_mesh 7 | from llama_recipes.utils.train_utils import * -------------------------------------------------------------------------------- /src/llama_recipes/utils/config_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | import inspect 5 | from dataclasses import asdict 6 | 7 | import torch.distributed as dist 8 | from torch.utils.data import DistributedSampler 9 | from peft import ( 10 | LoraConfig, 11 | AdaptionPromptConfig, 12 | PrefixTuningConfig, 13 | ) 14 | from transformers import default_data_collator 15 | from transformers.data import DataCollatorForSeq2Seq 16 | 17 | from llama_recipes.configs import datasets, lora_config, llama_adapter_config, prefix_config, train_config 18 | from llama_recipes.data.sampler import LengthBasedBatchSampler, DistributedLengthBasedBatchSampler 19 | from llama_recipes.utils.dataset_utils import DATASET_PREPROC 20 | 21 | 22 | def update_config(config, **kwargs): 23 | if isinstance(config, (tuple, list)): 24 | for c in config: 25 | update_config(c, **kwargs) 26 | else: 27 | for k, v in kwargs.items(): 28 | if hasattr(config, k): 29 | setattr(config, k, v) 30 | elif "." in k: 31 | # allow --some_config.some_param=True 32 | config_name, param_name = k.split(".") 33 | if type(config).__name__ == config_name: 34 | if hasattr(config, param_name): 35 | setattr(config, param_name, v) 36 | else: 37 | # In case of specialized config we can warm user 38 | print(f"Warning: {config_name} does not accept parameter: {k}") 39 | elif isinstance(config, train_config): 40 | print(f"Warning: unknown parameter {k}") 41 | 42 | 43 | def generate_peft_config(train_config, kwargs): 44 | configs = (lora_config, llama_adapter_config, prefix_config) 45 | peft_configs = (LoraConfig, AdaptionPromptConfig, PrefixTuningConfig) 46 | names = tuple(c.__name__.rstrip("_config") for c in configs) 47 | 48 | assert train_config.peft_method in names, f"Peft config not found: {train_config.peft_method}" 49 | 50 | config = configs[names.index(train_config.peft_method)]() 51 | 52 | update_config(config, **kwargs) 53 | params = asdict(config) 54 | peft_config = peft_configs[names.index(train_config.peft_method)](**params) 55 | 56 | return peft_config 57 | 58 | 59 | def generate_dataset_config(train_config, kwargs): 60 | names = tuple(DATASET_PREPROC.keys()) 61 | 62 | assert train_config.dataset in names, f"Unknown dataset: {train_config.dataset}" 63 | 64 | dataset_config = {k:v for k, v in inspect.getmembers(datasets)}[train_config.dataset]() 65 | 66 | update_config(dataset_config, **kwargs) 67 | return dataset_config 68 | 69 | 70 | def get_dataloader_kwargs(train_config, dataset, tokenizer, mode): 71 | kwargs = {} 72 | batch_size = train_config.batch_size_training if mode=="train" else train_config.val_batch_size 73 | if train_config.batching_strategy == "padding": 74 | if train_config.enable_fsdp: 75 | kwargs["batch_sampler"] = DistributedLengthBasedBatchSampler( 76 | dataset, 77 | batch_size=batch_size, 78 | rank=dist.get_rank(), 79 | num_replicas=dist.get_world_size(), 80 | shuffle=mode=="train", 81 | ) 82 | else: 83 | kwargs["batch_sampler"] = LengthBasedBatchSampler(dataset, 
batch_size, drop_last=True, shuffle=mode=="train") 84 | kwargs["collate_fn"] = DataCollatorForSeq2Seq(tokenizer) 85 | elif train_config.batching_strategy == "packing": 86 | if train_config.enable_fsdp: 87 | kwargs["sampler"] = DistributedSampler( 88 | dataset, 89 | rank=dist.get_rank(), 90 | num_replicas=dist.get_world_size(), 91 | shuffle=mode=="train", 92 | drop_last=True, 93 | ) 94 | kwargs["batch_size"] = batch_size 95 | kwargs["drop_last"] = True 96 | kwargs["collate_fn"] = default_data_collator 97 | else: 98 | raise ValueError(f"Unknown batching strategy: {train_config.batching_strategy}") 99 | 100 | return kwargs 101 | -------------------------------------------------------------------------------- /src/llama_recipes/utils/dataset_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | import importlib 5 | from functools import partial 6 | from pathlib import Path 7 | 8 | import torch 9 | 10 | from llama_recipes.datasets import ( 11 | get_grammar_dataset, 12 | get_alpaca_dataset, 13 | get_samsum_dataset, 14 | ) 15 | 16 | 17 | def load_module_from_py_file(py_file: str) -> object: 18 | """ 19 | This method loads a module from a py file which is not in the Python path 20 | """ 21 | module_name = Path(py_file).name 22 | loader = importlib.machinery.SourceFileLoader(module_name, py_file) 23 | spec = importlib.util.spec_from_loader(module_name, loader) 24 | module = importlib.util.module_from_spec(spec) 25 | 26 | loader.exec_module(module) 27 | 28 | return module 29 | 30 | 31 | def get_custom_dataset(dataset_config, tokenizer, split: str): 32 | if ":" in dataset_config.file: 33 | module_path, func_name = dataset_config.file.split(":") 34 | else: 35 | module_path, func_name = dataset_config.file, "get_custom_dataset" 36 | 37 | if not module_path.endswith(".py"): 38 | raise ValueError(f"Dataset file {module_path} is not a .py file.") 39 | 40 | module_path = Path(module_path) 41 | if not module_path.is_file(): 42 | raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.") 43 | 44 | module = load_module_from_py_file(module_path.as_posix()) 45 | try: 46 | return getattr(module, func_name)(dataset_config, tokenizer, split) 47 | except AttributeError as e: 48 | print(f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()}).") 49 | raise e 50 | 51 | 52 | DATASET_PREPROC = { 53 | "alpaca_dataset": partial(get_alpaca_dataset), 54 | "grammar_dataset": get_grammar_dataset, 55 | "samsum_dataset": get_samsum_dataset, 56 | "custom_dataset": get_custom_dataset, 57 | } 58 | 59 | 60 | def get_preprocessed_dataset( 61 | tokenizer, dataset_config, split: str = "train" 62 | ) -> torch.utils.data.Dataset: 63 | if not dataset_config.dataset in DATASET_PREPROC: 64 | raise NotImplementedError(f"{dataset_config.dataset} is not (yet) implemented") 65 | 66 | def get_split(): 67 | return ( 68 | dataset_config.train_split 69 | if split == "train" 70 | else dataset_config.test_split 71 | ) 72 | 73 | return DATASET_PREPROC[dataset_config.dataset]( 74 | dataset_config, 75 | tokenizer, 76 | get_split(), 77 | ) 78 | -------------------------------------------------------------------------------- /src/llama_recipes/utils/flop_utils.py: 
-------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Optional, Union 2 | import time 3 | import torch 4 | from torch.utils.flop_counter import FlopCounterMode 5 | 6 | 7 | class FlopMeasure(FlopCounterMode): 8 | """ 9 | ``FlopMeasure`` is a customized context manager that counts the number of 10 | flops within its context. It is based on ``FlopCounterMode`` with additional start_counting() and stop_counting() function so that the flop counting 11 | will only start after the warmup stage. 12 | It also supports hierarchical output by passing a module (or list of modules) to FlopCounterMode on construction. 13 | 14 | Example usage 15 | 16 | .. code-block:: python 17 | 18 | model = ... 19 | flop_counter = FlopMeasure(model,local_rank=0,warmup_step=3) 20 | for batch in enumerate(dataloader): 21 | with flop_counter: 22 | model(batch) 23 | flop_counter.step() 24 | """ 25 | 26 | def __init__( 27 | self, 28 | mods: Optional[Union[torch.nn.Module, List[torch.nn.Module]]] = None, 29 | depth: int = 2, 30 | display: bool = True, 31 | custom_mapping: Dict[Any, Any] = None, 32 | rank=None, 33 | warmup_step: int = 3, 34 | ): 35 | super().__init__(mods, depth, display, custom_mapping) 36 | self.rank = rank 37 | self.warmup_step = warmup_step 38 | self.start_time = 0 39 | self.end_time = 0 40 | 41 | def step(self): 42 | # decrease the warmup step by 1 for every step, so that the flop counting will start when warmup_step =0. Stop decreasing when warm_up reaches -1. 43 | if self.warmup_step >= 0: 44 | self.warmup_step -= 1 45 | if self.warmup_step == 0 and self.start_time == 0: 46 | self.start_time = time.time() 47 | elif self.warmup_step == -1 and self.start_time != 0 and self.end_time == 0: 48 | self.end_time = time.time() 49 | def __enter__(self): 50 | if self.warmup_step == 0: 51 | self.start_time = time.time() 52 | super().__enter__() 53 | return self 54 | def is_done(self): 55 | return self.warmup_step == -1 56 | def get_total_flops(self): 57 | return super().get_total_flops() 58 | def get_flops_per_sec(self): 59 | if self.start_time == 0 or self.end_time == 0: 60 | print("Warning: flop count did not finish correctly") 61 | return 0 62 | return super().get_total_flops()/ (self.end_time - self.start_time) 63 | def get_table(self, depth=2): 64 | return super().get_table(depth) 65 | 66 | def __exit__(self, *args): 67 | if self.get_total_flops() == 0: 68 | print( 69 | "Warning: did not record any flops this time. 
Skipping the flop report" 70 | ) 71 | else: 72 | if self.display: 73 | if self.rank is None or self.rank == 0: 74 | print("Total time used in this flop counting step is: {}".format(self.end_time - self.start_time)) 75 | print("The total TFlop per second is: {}".format(self.get_flops_per_sec() / 1e12)) 76 | print("The tflop_count table is below:") 77 | print(self.get_table(self.depth)) 78 | # Disable the display feature so that we don't print the table again 79 | self.display = False 80 | super().__exit__(*args) 81 | 82 | def __torch_dispatch__(self, func, types, args=(), kwargs=None): 83 | # when warmup_step is 0, count the flops and return the original output 84 | if self.warmup_step == 0: 85 | return super().__torch_dispatch__(func, types, args, kwargs) 86 | # otherwise, just return the original output 87 | return func(*args, **kwargs) 88 | -------------------------------------------------------------------------------- /src/llama_recipes/utils/fsdp_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | from torch.distributed._tensor.device_mesh import init_device_mesh 4 | import os 5 | 6 | def fsdp_auto_wrap_policy(model, transformer_layer_name): 7 | import functools 8 | 9 | from torch.distributed.fsdp.wrap import _or_policy, lambda_auto_wrap_policy, transformer_auto_wrap_policy 10 | 11 | from peft.tuners import PrefixEncoder, PromptEmbedding, PromptEncoder 12 | 13 | def lambda_policy_fn(module): 14 | if ( 15 | len(list(module.named_children())) == 0 16 | and getattr(module, "weight", None) is not None 17 | and module.weight.requires_grad 18 | ): 19 | return True 20 | return False 21 | 22 | lambda_policy = functools.partial(lambda_auto_wrap_policy, lambda_fn=lambda_policy_fn) 23 | transformer_wrap_policy = functools.partial( 24 | transformer_auto_wrap_policy, 25 | transformer_layer_cls=( 26 | PrefixEncoder, 27 | PromptEncoder, 28 | PromptEmbedding, 29 | transformer_layer_name, 30 | # FullyShardedDataParallelPlugin.get_module_class_from_name( 31 | # model, transformer_layer_name 32 | # ), 33 | ), 34 | ) 35 | 36 | auto_wrap_policy = functools.partial(_or_policy, policies=[lambda_policy, transformer_wrap_policy]) 37 | return auto_wrap_policy 38 | 39 | 40 | def hsdp_device_mesh(replica_group_size, sharding_group_size, device=None): 41 | """ 42 | Initializes a device mesh for use with Hybrid Sharding strategy in FSDP (HSDP) training. 43 | 44 | This function requires explicit sizes for replica and sharding groups to accommodate models 45 | whose GPU fit is unknown, providing flexibility in distributed training setups. 46 | 47 | Args: 48 | replica_group_size (int): The size of each replica group. Must be provided to ensure 49 | the model fits within the available resources. 50 | sharding_group_size (int): The size of each sharding group that the model can fit. Must be provided to 51 | ensure the correct distribution of model parameters. 52 | device (str, optional): The device to use (e.g., "cuda:0"). If None, defaults to "cuda" 53 | with the local rank as the device index. 54 | 55 | Returns: 56 | A device mesh object compatible with FSDP. 57 | 58 | Raises: 59 | ValueError: If replica_group_size or sharding_group_size are not provided, or if the 60 | world size is not evenly divisible by the sharding group size. 61 | RuntimeError: If a valid device mesh cannot be created. 
62 | 63 | Usage: 64 | If your model fits on 4 GPUs, and you have 3 nodes of 8 GPUs, then: 65 | Sharding_Group_Size = 4 66 | Replica_Group_Size = (24 total GPUs) / (4 per sharding group) = 6 replica groups 67 | >>> device_mesh = hsdp_device_mesh(replica_group_size, sharding_group_size) 68 | >>> sharded_model = FSDP(model, device_mesh=device_mesh, ...) 69 | """ 70 | 71 | if replica_group_size is None or sharding_group_size is None: 72 | raise ValueError("Both replica_group_size and sharding_group_size must be provided.") 73 | 74 | local_rank = int(os.getenv("LOCAL_RANK", "0")) 75 | world_size = int(os.getenv("WORLD_SIZE", "1")) 76 | 77 | device = device or "cuda" 78 | 79 | if world_size % sharding_group_size != 0: 80 | raise ValueError(f"World size {world_size} is not evenly divisible by " 81 | f"sharding group size {sharding_group_size}.") 82 | 83 | if (world_size // sharding_group_size) % replica_group_size != 0: 84 | raise ValueError(f"The calculated number of replica groups is not evenly divisible by " 85 | f"replica_group_size {replica_group_size}.") 86 | 87 | device_mesh = init_device_mesh(device, (replica_group_size, sharding_group_size)) 88 | if device_mesh is None: 89 | raise RuntimeError("Failed to create a valid device mesh.") 90 | 91 | return device_mesh 92 | -------------------------------------------------------------------------------- /src/llama_recipes/utils/hf_llama_conversion/README.md: -------------------------------------------------------------------------------- 1 | # Convert Hugging Face llama weights to official llama consolidated format 2 | 3 | This is the reverse of the `convert_llama_weights_to_hf.py` script from the transformers package. 4 | 5 | ## Step 0: Convert to consolidated format 6 | - Create an output directory for the converted weights, such as `test70B`. 7 | - Copy the file `params.json` from the official llama download into that directory. 8 | - Run the conversion script. `model-path` can be a Hugging Face hub model or a local hf model directory. 9 | ``` 10 | python -m llama_recipes.tools.convert_hf_weights_to_llama --model-path meta-llama/Llama-2-70b-chat-hf --output-dir test70B --model-size 70B 11 | ``` 12 | 13 | ## Step 1: Run inference 14 | Check out the official llama inference [repo](https://github.com/facebookresearch/llama). Test using chat or text completion. 15 | ``` 16 | torchrun --nproc_per_node 8 example_chat_completion.py --ckpt_dir ./test70B --tokenizer_path ${llama_2_dir}/tokenizer.model 17 | ``` 18 | 19 | For validation, compare the converted weights with the official Llama 2 weights: 20 | ``` 21 | python compare_llama_weights.py test70B ${llama_2_70b_chat_dir} 22 | ``` 23 | -------------------------------------------------------------------------------- /src/llama_recipes/utils/hf_llama_conversion/compare_llama_weights.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
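# --- Illustrative aside (not part of compare_llama_weights.py) ----------------
# A quick sanity check of the group arithmetic behind hsdp_device_mesh() above:
# with 3 nodes x 8 GPUs and a sharding group of 4, the mesh is
# (world_size // sharding_group_size) = 6 replica groups by 4 shards. The values
# below are the docstring's example, not defaults taken from the code:
#
#   world_size, sharding_group_size = 24, 4
#   replica_group_size = world_size // sharding_group_size        # -> 6
#   assert world_size % sharding_group_size == 0
#   assert replica_group_size * sharding_group_size == world_size
#   # hsdp_device_mesh(replica_group_size, sharding_group_size) then builds a
#   # (6, 4) device mesh that FSDP's hybrid sharding strategy can consume.
# -------------------------------------------------------------------------------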
3 | 4 | import gc 5 | import glob 6 | import os 7 | import sys 8 | 9 | import torch 10 | import tqdm 11 | 12 | 13 | def main() -> None: 14 | """Compare two llama checkpoint directories""" 15 | 16 | one_files = sorted(glob.glob(os.path.join(sys.argv[1], "consolidated.*.pth"))) 17 | two_files = sorted(glob.glob(os.path.join(sys.argv[2], "consolidated.*.pth"))) 18 | assert len(one_files) == len( 19 | two_files 20 | ), "One directory has {} files while another has {} files.".format( 21 | len(one_files), len(two_files) 22 | ) 23 | 24 | deltas = [] 25 | for i in tqdm.trange(len(one_files), desc="Comparing shards"): 26 | one = torch.load(one_files[i]) 27 | two = torch.load(two_files[i]) 28 | assert len(one) == len( 29 | two 30 | ), "shard should have the same length: {} != {}".format(len(one), len(two)) 31 | 32 | for _, (v, w) in enumerate(zip(one.items(), two.items())): 33 | assert v[0] == w[0], "{} != {}".format(v[0], w[0]) 34 | assert v[1].shape == w[1].shape, "tensor {} shape {} != {}".format( 35 | v[0], v[1].shape, w[1].shape 36 | ) 37 | 38 | delta = (v[1] - w[1]).abs().max().item() 39 | deltas.append((i, v[0], delta)) 40 | del one 41 | del two 42 | gc.collect() 43 | 44 | deltas = sorted(deltas, key=lambda x: x[-1], reverse=True) 45 | print("Top 10 largest deltas:") 46 | for i, k, v in deltas[:10]: 47 | print(f" shard {i} {k}: {v}") 48 | 49 | 50 | if __name__ == "__main__": 51 | main() 52 | -------------------------------------------------------------------------------- /src/llama_recipes/utils/memory_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 
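# --- Illustrative usage note (not part of the original memory_utils.py) -------
# MemoryTrace below snapshots GPU/XPU and CPU memory on entry and computes the
# peak/used deltas on exit. A minimal sketch of how a training loop might wrap
# an epoch with it (dataloader and train_step are hypothetical stand-ins):
#
#   with MemoryTrace() as memtrace:
#       for batch in dataloader:
#           train_step(batch)
#   memtrace.print_stats()   # peak allocated / reserved, peak active, malloc retries
# -------------------------------------------------------------------------------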
3 | 4 | import gc 5 | import psutil 6 | import threading 7 | 8 | import torch 9 | from accelerate.utils import is_xpu_available 10 | 11 | def byte2gb(x): 12 | return int(x / 2**30) 13 | # This context manager is used to track the peak memory usage of the process 14 | class MemoryTrace: 15 | def __enter__(self): 16 | gc.collect() 17 | if is_xpu_available(): 18 | torch.xpu.empty_cache() 19 | torch.xpu.reset_max_memory_allocated() # reset the peak gauge to zero 20 | self.begin = byte2gb(torch.xpu.memory_allocated()) 21 | elif torch.cuda.is_available(): 22 | torch.cuda.empty_cache() 23 | torch.cuda.reset_max_memory_allocated() # reset the peak gauge to zero 24 | self.begin = byte2gb(torch.cuda.memory_allocated()) 25 | self.process = psutil.Process() 26 | self.cpu_begin = byte2gb(self.cpu_mem_used()) 27 | self.peak_monitoring = True 28 | peak_monitor_thread = threading.Thread(target=self.peak_monitor_func) 29 | peak_monitor_thread.daemon = True 30 | peak_monitor_thread.start() 31 | return self 32 | 33 | def cpu_mem_used(self): 34 | """get resident set size memory for the current process""" 35 | return self.process.memory_info().rss 36 | 37 | def peak_monitor_func(self): 38 | self.cpu_peak = -1 39 | 40 | while True: 41 | self.cpu_peak = max(self.cpu_mem_used(), self.cpu_peak) 42 | 43 | # can't sleep or will not catch the peak right (this comment is here on purpose) 44 | # time.sleep(0.001) # 1msec 45 | 46 | if not self.peak_monitoring: 47 | break 48 | 49 | def __exit__(self, *exc): 50 | self.peak_monitoring = False 51 | 52 | gc.collect() 53 | if is_xpu_available(): 54 | torch.xpu.empty_cache() 55 | self.end = byte2gb(torch.xpu.memory_allocated()) 56 | self.peak = byte2gb(torch.xpu.max_memory_allocated()) 57 | xpu_info = torch.xpu.memory_stats() 58 | self.peak_active_gb = byte2gb(xpu_info["active_bytes.all.peak"]) 59 | self.malloc_retries = xpu_info.get("num_alloc_retries", 0) 60 | self.peak_active_gb = byte2gb(xpu_info["active_bytes.all.peak"]) 61 | self.m_ooms = xpu_info.get("num_ooms", 0) 62 | self.used = byte2gb(self.end - self.begin) 63 | self.peaked = byte2gb(self.peak - self.begin) 64 | self.max_reserved = byte2gb(torch.xpu.max_memory_reserved()) 65 | elif torch.cuda.is_available(): 66 | torch.cuda.empty_cache() 67 | self.end = byte2gb(torch.cuda.memory_allocated()) 68 | self.peak = byte2gb(torch.cuda.max_memory_allocated()) 69 | cuda_info = torch.cuda.memory_stats() 70 | self.peak_active_gb = byte2gb(cuda_info["active_bytes.all.peak"]) 71 | self.malloc_retries = cuda_info.get("num_alloc_retries", 0) 72 | self.peak_active_gb = byte2gb(cuda_info["active_bytes.all.peak"]) 73 | self.m_ooms = cuda_info.get("num_ooms", 0) 74 | self.used = byte2gb(self.end - self.begin) 75 | self.peaked = byte2gb(self.peak - self.begin) 76 | self.max_reserved = byte2gb(torch.cuda.max_memory_reserved()) 77 | 78 | self.cpu_end = self.cpu_mem_used() 79 | self.cpu_used = byte2gb(self.cpu_end - self.cpu_begin) 80 | self.cpu_peaked = byte2gb(self.cpu_peak - self.cpu_begin) 81 | # print(f"delta used/peak {self.used:4d}/{self.peaked:4d}") 82 | 83 | def print_stats(self): 84 | device_str = None 85 | if is_xpu_available(): 86 | device_str = "XPU" 87 | elif torch.cuda.is_available(): 88 | device_str = "CUDA" 89 | 90 | if device_str: 91 | print(f"Max {device_str} memory allocated was {self.peak} GB") 92 | print(f"Max {device_str} memory reserved was {self.max_reserved} GB") 93 | print(f"Peak active {device_str} memory was {self.peak_active_gb} GB") 94 | print(f"{device_str} Malloc retries : {self.malloc_retries}") 95 | 
print(f"CPU Total Peak Memory consumed during the train (max): {self.cpu_peaked + self.cpu_begin} GB") -------------------------------------------------------------------------------- /src/llama_recipes/utils/plot_metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | import json 5 | import matplotlib.pyplot as plt 6 | import argparse 7 | import os 8 | 9 | def plot_metric(data, metric_name, x_label, y_label, title, colors): 10 | plt.figure(figsize=(7, 6)) 11 | 12 | plt.plot(data[f'train_epoch_{metric_name}'], label=f'Train Epoch {metric_name.capitalize()}', color=colors[0]) 13 | plt.plot(data[f'val_epoch_{metric_name}'], label=f'Validation Epoch {metric_name.capitalize()}', color=colors[1]) 14 | plt.xlabel(x_label) 15 | plt.ylabel(y_label) 16 | plt.title(f'Train and Validation Epoch {title}') 17 | plt.legend() 18 | plt.tight_layout() 19 | 20 | def plot_single_metric_by_step(data, metric_name, x_label, y_label, title, color): 21 | plt.plot(data[f'{metric_name}'], label=f'{title}', color=color) 22 | plt.xlabel(x_label) 23 | plt.ylabel(y_label) 24 | plt.title(title) 25 | plt.legend() 26 | plt.tight_layout() 27 | 28 | def plot_metrics_by_step(data, metric_name, x_label, y_label, colors): 29 | plt.figure(figsize=(14, 6)) 30 | 31 | plt.subplot(1, 2, 1) 32 | plot_single_metric_by_step(data, f'train_step_{metric_name}', x_label, y_label, f'Train Step {metric_name.capitalize()}', colors[0]) 33 | plt.subplot(1, 2, 2) 34 | plot_single_metric_by_step(data, f'val_step_{metric_name}', x_label, y_label, f'Validation Step {metric_name.capitalize()}', colors[1]) 35 | plt.tight_layout() 36 | 37 | 38 | def plot_metrics(file_path): 39 | if not os.path.exists(file_path): 40 | print(f"File {file_path} does not exist.") 41 | return 42 | 43 | with open(file_path, 'r') as f: 44 | try: 45 | data = json.load(f) 46 | except json.JSONDecodeError: 47 | print("Invalid JSON file.") 48 | return 49 | 50 | directory = os.path.dirname(file_path) 51 | filename_prefix = os.path.basename(file_path).split('.')[0] 52 | 53 | plot_metric(data, 'loss', 'Epoch', 'Loss', 'Loss', ['b', 'r']) 54 | plt.savefig(os.path.join(directory, f"{filename_prefix}_train_and_validation_loss.png")) 55 | plt.close() 56 | 57 | plot_metric(data, 'perplexity', 'Epoch', 'Perplexity', 'Perplexity', ['g', 'm']) 58 | plt.savefig(os.path.join(directory, f"{filename_prefix}_train_and_validation_perplexity.png")) 59 | plt.close() 60 | 61 | plot_metrics_by_step(data, 'loss', 'Step', 'Loss', ['b', 'r']) 62 | plt.savefig(os.path.join(directory, f"{filename_prefix}_train_and_validation_loss_by_step.png")) 63 | plt.close() 64 | 65 | plot_metrics_by_step(data, 'perplexity', 'Step', 'Loss', ['g', 'm']) 66 | plt.savefig(os.path.join(directory, f"{filename_prefix}_train_and_validation_perplexity_by_step.png")) 67 | plt.close() 68 | 69 | if __name__ == "__main__": 70 | parser = argparse.ArgumentParser(description='Plot metrics from JSON file.') 71 | parser.add_argument('--file_path', required=True, type=str, help='Path to the metrics JSON file.') 72 | args = parser.parse_args() 73 | 74 | plot_metrics(args.file_path) 75 | -------------------------------------------------------------------------------- /src/metrics/file_consistency.py: -------------------------------------------------------------------------------- 1 | def 
metric_json_file_consistency(floorplan): 2 | return floorplan.validate_normal 3 | 4 | def metric_json_strict_file_consistency(floorplan): 5 | return floorplan.validate_strict -------------------------------------------------------------------------------- /src/metrics/prompt_consistency.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | def _compute_recall_precision(TP, FP, FN): 3 | if len(TP) + len(FN) == 0: 4 | return None 5 | precision = len(TP) / (len(TP) + len(FP)) if len(TP) + len(FP) > 0 else 1.0 6 | recall = len(TP) / (len(TP) + len(FN)) 7 | return precision, recall 8 | 9 | def _compute_TP_FP_FN(predicted_set, real_set): 10 | TP = predicted_set & real_set 11 | FP = predicted_set - real_set 12 | FN = real_set - predicted_set 13 | return TP, FP, FN 14 | 15 | def _compute_TP_FP_FN_lists(predicted_L, real_L): 16 | TP, FP, FN = [], [], [] 17 | real_L = deepcopy(real_L) 18 | for i, pred in enumerate(predicted_L): 19 | if pred in real_L: 20 | TP.append(pred) 21 | real_L.remove(pred) 22 | else: 23 | FP.append(pred) 24 | FN = real_L 25 | return TP, FP, FN 26 | 27 | def metric_num_room_prompt_consistency(floorplan, prompt_floorplan): 28 | try: 29 | prompt_room_count = prompt_floorplan.get_room_count() 30 | except KeyError: 31 | return None 32 | try: 33 | floorplan_room_count = floorplan.get_room_count() 34 | return abs(floorplan_room_count - prompt_room_count)/prompt_room_count 35 | except KeyError: 36 | return 1.0 37 | 38 | def metric_room_id_prompt_consistency(floorplan, prompt_floorplan): 39 | floorplan_room_ids = floorplan.get_unmodified_room_ids() 40 | prompt_room_ids = prompt_floorplan.get_unmodified_room_ids() 41 | TP, FP, FN = _compute_TP_FP_FN(floorplan_room_ids, prompt_room_ids) 42 | return _compute_recall_precision(TP, FP, FN) 43 | 44 | def metric_room_area_prompt_consistency(floorplan, prompt_floorplan): 45 | floorplan_room_ids = floorplan.get_unmodified_room_ids() 46 | prompt_room_ids = prompt_floorplan.get_unmodified_room_ids() 47 | 48 | buff = [] 49 | for room_id in floorplan_room_ids & prompt_room_ids: 50 | try: 51 | floorplan_room_area = floorplan.get_room_polygon_area(room_id)[0] 52 | prompt_room_area = prompt_floorplan.get_room_area(room_id) 53 | buff.append(abs(floorplan_room_area - prompt_room_area) / prompt_room_area) 54 | except: 55 | pass 56 | return sum(buff) / len(buff) if len(buff)>0 else None 57 | 58 | def metric_polygon_area_sum_vs_total_area_prompt_consistency(floorplan, prompt_floorplan): 59 | 60 | try: 61 | prompt_total_area = prompt_floorplan.get_total_area() 62 | except KeyError: 63 | return None 64 | 65 | floorplan_room_ids = floorplan.get_unmodified_room_ids() 66 | polygon_total_area = 0.0 67 | for room_id in floorplan_room_ids: 68 | try: 69 | polygon_total_area += floorplan.get_room_polygon_area(room_id)[0] 70 | except: 71 | pass 72 | 73 | return abs(polygon_total_area - prompt_total_area) / prompt_total_area 74 | 75 | def metric_room_type_prompt_consistency(floorplan, prompt_floorplan): 76 | 77 | floorplan_room_types = floorplan.get_room_types() 78 | prompt_room_types = prompt_floorplan.get_room_types() 79 | 80 | TP, FP, FN = _compute_TP_FP_FN_lists(floorplan_room_types, prompt_room_types) 81 | return _compute_recall_precision(TP, FP, FN) 82 | 83 | def metric_room_id_type_match_prompt_consistency(floorplan, prompt_floorplan): 84 | floorplan_room_ids = floorplan.get_unmodified_room_ids() 85 | prompt_room_ids = prompt_floorplan.get_unmodified_room_ids() 86 | 87 | buff, numel = 0, 0 88 | 
for room_id in floorplan_room_ids & prompt_room_ids: 89 | try: 90 | prompt_room_type = prompt_floorplan.get_room_type(room_id) 91 | except KeyError: 92 | continue 93 | numel += 1 94 | try: 95 | floorplan_room_type = floorplan.get_room_type(room_id) 96 | except KeyError: 97 | continue 98 | buff += 1 if floorplan_room_type == prompt_room_type else 0 99 | return buff / numel if numel > 0 else None 100 | 101 | def metric_room_height_prompt_consistency(floorplan, prompt_floorplan): 102 | floorplan_room_ids = floorplan.get_unmodified_room_ids() 103 | prompt_room_ids = prompt_floorplan.get_unmodified_room_ids() 104 | 105 | buff = [] 106 | for room_id in floorplan_room_ids & prompt_room_ids: 107 | try: 108 | prompt_height = prompt_floorplan.get_room_height(room_id) 109 | polygon_height = floorplan.get_room_polygon(room_id).height 110 | buff.append(abs(polygon_height - prompt_height) / prompt_height) 111 | except: 112 | pass 113 | return sum(buff) / len(buff) if buff else None 114 | 115 | def metric_room_width_prompt_consistency(floorplan, prompt_floorplan): 116 | floorplan_room_ids = floorplan.get_unmodified_room_ids() 117 | prompt_room_ids = prompt_floorplan.get_unmodified_room_ids() 118 | 119 | buff = [] 120 | for room_id in floorplan_room_ids & prompt_room_ids: 121 | try: 122 | prompt_width = prompt_floorplan.get_room_width(room_id) 123 | polygon_width = floorplan.get_room_polygon(room_id).width 124 | buff.append(abs(polygon_width - prompt_width) / prompt_width) 125 | except: 126 | pass 127 | return sum(buff) / len(buff) if buff else None -------------------------------------------------------------------------------- /src/metrics/self_consistency.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | def metric_room_count_self_consistency(floorplan): 3 | try: 4 | return floorplan.get_room_count() == floorplan.get_num_rooms() 5 | except: 6 | return None 7 | 8 | def metric_room_id_self_consistency(floorplan): 9 | return len(floorplan.get_room_ids()) == floorplan.get_num_rooms() 10 | 11 | def metric_total_area_self_consistency(floorplan): 12 | room_ids = floorplan.get_room_ids() 13 | try: 14 | total_area = floorplan.get_total_area() 15 | except: 16 | return None 17 | area_diff = total_area 18 | for room_id in room_ids: 19 | try: 20 | area_diff -= floorplan.get_room_area(room_id) 21 | except: 22 | try: 23 | area_diff -= floorplan.get_room_polygon_area(room_id)[0] 24 | except: 25 | pass 26 | return abs(area_diff) / total_area 27 | 28 | def metric_polygon_area_self_consistency(floorplan): 29 | room_ids = floorplan.get_room_ids() 30 | area_scores = [] 31 | if_align_score = 0 # if area computed with sorted vertices is the same as the area computed with unsorted vertices 32 | num_valid_rooms = 0 33 | for room_id in room_ids: 34 | try: 35 | computed_area, if_align = floorplan.get_room_polygon_area(room_id) 36 | if_align_score += 1 if if_align else 0 37 | stated_area = floorplan.get_room_area(room_id) 38 | area_scores.append(abs(computed_area - stated_area) / stated_area) 39 | num_valid_rooms += 1 40 | except: 41 | pass 42 | 43 | return (sum(area_scores)/num_valid_rooms, if_align_score / num_valid_rooms) if num_valid_rooms>0 else None 44 | 45 | 46 | def metric_polygon_overlap_count_self_consistency(floorplan): 47 | return floorplan.count_room_overlaps() > 0 48 | 49 | def metric_polygon_containment_count_self_consistency(floorplan): 50 | raise NotImplementedError("Not implemented yet") 51 | 52 | def 
metric_room_height_self_consistency(floorplan): 53 | 54 | room_ids = set(floorplan.get_room_ids()) 55 | 56 | height_scores = [] 57 | for room_id in room_ids: 58 | try: 59 | stated_height = floorplan.get_room_height(room_id) 60 | polygon_height = floorplan.get_room_polygon(room_id).height 61 | height_scores.append(abs(polygon_height - stated_height) / stated_height) 62 | except: 63 | pass 64 | return sum(height_scores)/len(height_scores) if height_scores else None 65 | 66 | def metric_room_width_self_consistency(floorplan): 67 | room_ids = set(floorplan.get_room_ids()) 68 | 69 | width_scores = [] 70 | for room_id in room_ids: 71 | try: 72 | stated_width = floorplan.get_room_width(room_id) 73 | polygon_width = floorplan.get_room_polygon(room_id).width 74 | width_scores.append(abs(polygon_width - stated_width) / stated_width) 75 | except: 76 | pass 77 | return sum(width_scores)/len(width_scores) if width_scores else None -------------------------------------------------------------------------------- /src/pred/extract_output_json.py: -------------------------------------------------------------------------------- 1 | import json 2 | from ..utils import repair_json 3 | 4 | def extract_output_json(input_str: str): 5 | try: 6 | # Locate the generated JSON after the "Output:" marker (or the "assistant" tag). 7 | output_index = input_str.find("Output:") 8 | if output_index != -1: 9 | output_index += len("Output:\n") 10 | else: 11 | output_index = input_str.find("assistant") 12 | if output_index == -1: 13 | return None 14 | output_index += len("assistant") 15 | output_str = input_str[output_index:] 16 | output_dict = json.loads(output_str) 17 | return output_dict 18 | except json.JSONDecodeError: 19 | try: 20 | json_repaired = repair_json(output_str, return_objects=True) 21 | if json_repaired != "": 22 | return json_repaired 23 | else: 24 | return {} 25 | except Exception: 26 | return {} 27 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .process_dataset import Floorplan, FloorplansAndPrompt 2 | from .eval_sample import FloorplansAndPromptEvaluation 3 | from .json_repair import * 4 | from .polygon_object import Polygon 5 | from .eval_overall import Evaluate 6 | from .util import natural_sort_key, list_folders, list_json_files 7 | from .plot import get_df_from_summary, plot_radar_from_df, plot_categories_sanity_check, \ 8 | get_df_from_summary_separated_by_num_rooms, plot_3d_from_df 9 | 10 | 11 | __all__ = ['Floorplan', 'FloorplansAndPrompt', 'FloorplansAndPromptEvaluation', 'Polygon', 'Evaluate', 12 | 'natural_sort_key', 'list_folders', 'list_json_files', 13 | 'repair_json', 'json_loads', 'json_load', 'json_from_file','plot_radar_from_df', 14 | 'plot_categories_sanity_check', 'get_df_from_summary', 'get_df_from_summary_separated_by_num_rooms', 15 | 'plot_3d_from_df'] -------------------------------------------------------------------------------- /src/utils/bubble_graph.py: -------------------------------------------------------------------------------- 1 | import json_repair 2 | import numpy as np 3 | 4 | def extract_polygon(json_str=None, json_dict=None): 5 | room_info = extract_room_info(json_str,json_dict) 6 | return [room['floor_polygon'] for room in room_info] 7 | 8 | def extract_room_info(json_str=None, json_dict=None): 9 | ''' 10 | extract polygon, room type and room id when they exist 11 | ''' 12 | rooms_info = [] 13 | if json_dict is None: 14 | json_dict = json_repair.loads(json_str) 15 | if 'rooms' not in json_dict.keys(): 16 | return
None 17 | for room in json_dict['rooms']: 18 | room_d = {} 19 | if 'floor_polygon' in room.keys(): 20 | vertices = room['floor_polygon'] 21 | polygon = [] 22 | for vertix in vertices: 23 | if 'x' in vertix.keys() and 'z' in vertix.keys(): 24 | polygon.append([vertix['x'],vertix['z']]) 25 | room_d['floor_polygon'] = polygon 26 | if 'room_type' in room.keys(): 27 | room_d['room_type'] = room['room_type'] 28 | if 'id' in room.keys(): 29 | room_d['id'] = room['id'] 30 | rooms_info.append(room_d) 31 | return rooms_info 32 | 33 | def polygon2bbox(polygon): 34 | x_max, x_min, y_max, y_min = 0, np.inf, 0, np.inf 35 | for x,y in polygon: 36 | x_max = max(x_max,x) 37 | x_min = min(x_min,x) 38 | y_max = max(y_max,y) 39 | y_min = min(y_min,y) 40 | return (x_min, y_min, x_max, y_max) 41 | 42 | def bboxes2bubble(bboxes, th=9): 43 | ''' 44 | bboxes: list of xyxy definitions for each room 45 | ''' 46 | edges = [] 47 | for u in range(len(bboxes)): 48 | for v in range(u+1,len(bboxes)): 49 | if not collide2d(bboxes[u][:4],bboxes[v][:4],th=th): continue 50 | # uy0, ux0, uy1, ux1 = bboxes[u][:4] 51 | # vy0, vx0, vy1, vx1 = bboxes[v][:4] 52 | # uc = (uy0+uy1)/2,(ux0+ux1)/2 53 | # vc = (vy0+vy1)/2,(vx0+vx1)/2 54 | # if ux0 < vx0 and ux1 > vx1 and uy0 < vy0 and uy1 > vy1: 55 | # relation = 5 #'surrounding' 56 | # elif ux0 >= vx0 and ux1 <= vx1 and uy0 >= vy0 and uy1 <= vy1: 57 | # relation = 4 #'inside' 58 | # else: 59 | # relation = point_box_relation(uc,bboxes[v,:4]) 60 | # edges.append([u,v,relation]) 61 | edges.append([u,v]) 62 | 63 | edges = np.array(edges,dtype=int) 64 | return edges 65 | 66 | def collide2d(bbox1, bbox2, th=0): 67 | return not( 68 | (bbox1[0]-th > bbox2[2]) or 69 | (bbox1[2]+th < bbox2[0]) or 70 | (bbox1[1]-th > bbox2[3]) or 71 | (bbox1[3]+th < bbox2[1]) 72 | ) 73 | 74 | 75 | def get_edit_distance(g1,g2,g1_dict,g2_dict): 76 | ''' 77 | g1: graph 1 -- defined by pairs of connected nodes 78 | g2: graph 2 79 | g1_dict: dictionary containing info on nodes of g1 80 | g1_dict['node2room'] = list of room names where idx is room idx 81 | g1_dict['node2id'] = list of room idx to 'id' 82 | ''' 83 | pass 84 | 85 | def procthor2bubble(version=7): 86 | from datasets import load_from_disk 87 | from datasets import Dataset, DatasetDict 88 | ds_path = f'/network/scratch/l/luozhiha/datasets/procthor_data:v{version}' 89 | dataset = load_from_disk(ds_path) 90 | modified_data = {} 91 | for split in ['train','validation','test']: 92 | modified_split = [] 93 | dset = dataset[split] 94 | for idx, data in enumerate(dset): 95 | print(f'{split}: {idx}') 96 | room_info = extract_room_info(json_dict = data) 97 | polygons = [room['floor_polygon'] for room in room_info] 98 | bboxes = [polygon2bbox(pg) for pg in polygons] 99 | edges = bboxes2bubble(bboxes,th=2) 100 | data['edges'] = edges.tolist() 101 | modified_split.append(data) 102 | modified_data[split] = Dataset.from_list(modified_split) 103 | modified_data = DatasetDict(modified_data) 104 | version = 8 105 | ds_path = f'/network/scratch/l/luozhiha/datasets/procthor_data:v{version}' 106 | modified_data.save_to_disk(ds_path) 107 | import pdb; pdb.set_trace() 108 | 109 | if __name__ == '__main__': 110 | procthor2bubble(version=7) -------------------------------------------------------------------------------- /src/utils/fp_plot/__init__.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from . 
import procthorpy 3 | # import rplanpy 4 | 5 | # def plot_rplan(file: str, out_file: str = 'output_graph.png', plot_graph: bool = False) -> None: 6 | # data = rplanpy.data.RplanData(file) 7 | # ncols = 2 if plot_graph else 1 8 | # _fig, ax = plt.subplots(nrows=1, ncols=ncols, figsize=(ncols*5, 5)) 9 | 10 | # if plot_graph: 11 | # rplanpy.plot.plot_floorplan(data, ax=ax[0], title="Rooms") 12 | # rplanpy.plot.plot_floorplan_graph( 13 | # data=data, with_colors=True, edge_label='door', ax=ax[1], 14 | # title="Bubble graph" 15 | # ) 16 | # else: 17 | # rplanpy.plot.plot_floorplan(data, ax=ax, title="Rooms") 18 | 19 | # plt.tight_layout() 20 | # plt.savefig(out_file) 21 | # plt.show() 22 | 23 | def plot_procthor(data, out_file: str = 'output_procthor.png') -> None: 24 | data = data["rooms"] 25 | ncols = 1 26 | _fig, ax = plt.subplots(nrows=1, ncols=ncols, figsize=(ncols*5, 5)) 27 | 28 | procthorpy.plot.plot_floorplan(data, ax=ax, title=None, label_rooms=False) 29 | 30 | plt.tight_layout() 31 | plt.savefig(out_file,bbox_inches='tight', transparent=True) 32 | plt.clf() 33 | plt.close() 34 | # plt.show() -------------------------------------------------------------------------------- /src/utils/fp_plot/procthorpy/__init__.py: -------------------------------------------------------------------------------- 1 | from . import utils 2 | from . import plot 3 | -------------------------------------------------------------------------------- /src/utils/fp_plot/procthorpy/plot.py: -------------------------------------------------------------------------------- 1 | from matplotlib import patches 2 | import matplotlib.pyplot as plt 3 | import networkx as nx 4 | import numpy as np 5 | from .utils import ROOM_COLOR, ROOM_TYPE 6 | 7 | 8 | def floorplan_to_color(data): 9 | room_colors = [] 10 | for room in data: 11 | room_type_key = next(key for key, value in ROOM_TYPE.items() if value == room['room_type']) 12 | color = ROOM_COLOR[room_type_key] 13 | room_colors.append((room['floor_polygon'], color, room['room_type'])) 14 | return room_colors 15 | 16 | 17 | def plot_floorplan(data, ax=None, title=None, wall_thickness=0.4, label_rooms=False): 18 | room_colors = floorplan_to_color(data) 19 | 20 | for polygon, color, room_type in room_colors: 21 | polygon_points = [(point['x'], point['z']) for point in polygon] 22 | color_normalized = [c / 255.0 for c in color] 23 | 24 | # Draw room 25 | polygon_shape = patches.Polygon(polygon_points, closed=True, edgecolor='black', facecolor=color_normalized, linewidth=2) 26 | ax.add_patch(polygon_shape) 27 | 28 | # # Draw walls 29 | for i in range(len(polygon_points)): 30 | start_point = polygon_points[i] 31 | end_point = polygon_points[(i + 1) % len(polygon_points)] 32 | ax.plot([start_point[0], end_point[0]], [start_point[1], end_point[1]], color=[c / 255.0 for c in ROOM_COLOR[14]], linewidth=wall_thickness * 10) 33 | 34 | # Room label 35 | if label_rooms: 36 | centroid = np.mean(polygon_points, axis=0) 37 | ax.text(centroid[0], centroid[1], room_type, ha='center', va='center', fontsize=6, weight='bold', color='black') 38 | 39 | ax.set_xlim(-1, max(p['x'] for room in data for p in room['floor_polygon']) + 1) 40 | ax.set_ylim(-1, max(p['z'] for room in data for p in room['floor_polygon']) + 1) 41 | ax.set_aspect('equal', adjustable='box') 42 | ax.axis('off') 43 | 44 | if title: 45 | ax.set_title(title) 46 | 47 | return ax 48 | -------------------------------------------------------------------------------- /src/utils/fp_plot/procthorpy/utils.py: 
-------------------------------------------------------------------------------- 1 | ROOM_TYPE = { 2 | 0: "LivingRoom", 3 | 1: "Bedroom", 4 | 2: "Kitchen", 5 | 3: "Bathroom", 6 | 4: 'MasterRoom', 7 | 5: 'DiningRoom', 8 | 6: 'ChildRoom', 9 | 7: 'StudyRoom', 10 | 8: 'SecondRoom', 11 | 9: 'GuestRoom', 12 | 10: 'Balcony', 13 | 11: 'Entrance', 14 | 12: 'Storage', 15 | } 16 | 17 | ROOM_COLOR = { 18 | 0: [244, 242, 229], 19 | 1: [253, 244, 171], 20 | 2: [234, 216, 214], 21 | 3: [205, 233, 252], 22 | 4: [244, 242, 229], 23 | 5: [253, 244, 171], 24 | 6: [253, 244, 171], 25 | 7: [253, 244, 171], 26 | 8: [253, 244, 171], 27 | 9: [208, 216, 135], 28 | 10: [244, 242, 229], 29 | 11: [249, 222, 189], 30 | 12: [128, 128, 128], 31 | 13: [255, 255, 255], 32 | 14: [79, 79, 79], 33 | 15: [255, 225, 25], 34 | 16: [128, 128, 128], 35 | 17: [255, 225, 25], 36 | } -------------------------------------------------------------------------------- /src/utils/json_check/__init__.py: -------------------------------------------------------------------------------- 1 | from .schema import schema, strict_schema 2 | from .verify import is_valid_json 3 | 4 | __all__ = ["schema", "strict_schema", "is_valid_json"] -------------------------------------------------------------------------------- /src/utils/json_check/schema.py: -------------------------------------------------------------------------------- 1 | strict_schema = { 2 | "type": "object", 3 | "properties": { 4 | "room_count": { 5 | "type": "integer" 6 | }, 7 | "total_area": { 8 | "type": "number" 9 | }, 10 | "room_types": { 11 | "type": "array", 12 | "items": { 13 | "type": "string" 14 | } 15 | }, 16 | "rooms": { 17 | "type": "array", 18 | "items": { 19 | "type": "object", 20 | "properties": { 21 | "id": { 22 | "type": "string" 23 | }, 24 | "room_type": { 25 | "type": "string" 26 | }, 27 | "area": { 28 | "type": "number" 29 | }, 30 | "width": { 31 | "type": "number" 32 | }, 33 | "height": { 34 | "type": "number" 35 | }, 36 | "is_regular": { 37 | "type": "integer" 38 | }, 39 | "floor_polygon": { 40 | "type": "array", 41 | "items": { 42 | "type": "object", 43 | "properties": { 44 | "x": { 45 | "type": "number" 46 | }, 47 | "z": { 48 | "type": "number" 49 | } 50 | }, 51 | "required": ["x", "z"] 52 | } 53 | } 54 | }, 55 | "required": ["id", "room_type", "area", "width", "height", "is_regular", "floor_polygon"] 56 | } 57 | }, 58 | }, 59 | "required": ["room_count", "total_area", "room_types", "rooms"] 60 | } 61 | 62 | schema = { 63 | "type": "object", 64 | "properties": { 65 | "room_count": { 66 | "type": "integer" 67 | }, 68 | "total_area": { 69 | "type": "number" 70 | }, 71 | "room_types": { 72 | "type": "array", 73 | "items": { 74 | "type": "string" 75 | } 76 | }, 77 | "rooms": { 78 | "type": "array", 79 | "items": { 80 | "type": "object", 81 | "properties": { 82 | "id": { 83 | "type": "string" 84 | }, 85 | "room_type": { 86 | "type": "string" 87 | }, 88 | "area": { 89 | "type": "number" 90 | }, 91 | "width": { 92 | "type": "number" 93 | }, 94 | "height": { 95 | "type": "number" 96 | }, 97 | "is_regular": { 98 | "type": "integer" 99 | }, 100 | "floor_polygon": { 101 | "type": "array", 102 | "items": { 103 | "type": "object", 104 | "properties": { 105 | "x": { 106 | "type": "number" 107 | }, 108 | "y": { 109 | "type": "number" 110 | }, 111 | "z": { 112 | "type": "number" 113 | } 114 | }, 115 | "anyOf": [ 116 | {"required": ["x", "z"]}, 117 | {"required": ["x", "y"]} 118 | ] 119 | } 120 | } 121 | } 122 | } 123 | }, 124 | "doors": { 125 | "type": "array", 126 | "items": 
{ 127 | "type": "object", 128 | "properties": { 129 | "id": { 130 | "type": "string" 131 | }, 132 | "position": { 133 | "type": "array", 134 | "items": { 135 | "type": "object", 136 | "properties": { 137 | "x": { 138 | "type": "number" 139 | }, 140 | "y": { 141 | "type": "number" 142 | }, 143 | "z": { 144 | "type": "number" 145 | } 146 | }, 147 | "anyOf": [ 148 | {"required": ["x", "z"]}, 149 | {"required": ["x", "y"]} 150 | ] 151 | } 152 | } 153 | }, 154 | "required": ["id", "position"] 155 | } 156 | }, 157 | "windows": { 158 | "type": "array", 159 | "items": { 160 | "type": "object", 161 | "properties": { 162 | "id": { 163 | "type": "string" 164 | }, 165 | "position": { 166 | "type": "array", 167 | "items": { 168 | "type": "object", 169 | "properties": { 170 | "x": { 171 | "type": "number" 172 | }, 173 | "y": { 174 | "type": "number" 175 | }, 176 | "z": { 177 | "type": "number" 178 | } 179 | }, 180 | "anyOf": [ 181 | {"required": ["x", "z"]}, 182 | {"required": ["x", "y"]} 183 | ] 184 | } 185 | } 186 | }, 187 | "required": ["id", "position"] 188 | } 189 | } 190 | } 191 | } -------------------------------------------------------------------------------- /src/utils/json_check/verify.py: -------------------------------------------------------------------------------- 1 | from jsonschema import validate 2 | from jsonschema.exceptions import ValidationError 3 | from .schema import schema, strict_schema 4 | 5 | def is_valid_json(json_data, strict=False): 6 | _schema = strict_schema if strict else schema 7 | try: 8 | validate(json_data, _schema) 9 | return True 10 | except ValidationError as e: 11 | return False -------------------------------------------------------------------------------- /src/utils/polygon_object.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | def line_intersect(vertex1, vertex2, vertex3, vertex4): 4 | # check if two lines intersect 5 | def ccw(A, B, C): 6 | return (C['z']-A['z']) * (B['x']-A['x']) >= (B['z']-A['z']) * (C['x']-A['x']) 7 | A, B, C, D = vertex1, vertex2, vertex3, vertex4 8 | return ccw(A, C, D) != ccw(B, C, D) and ccw(A, B, C) != ccw(A, B, D) 9 | 10 | # def line_intersect_dumb(p1,p2,q1,q2): 11 | # xp_min = min(p1['x'],p2['x']) 12 | # xp_max = max(p1['x'],p2['x']) 13 | # xq_min = min(q1['x'],q2['x']) 14 | # xq_max = max(q1['x'],q2['x']) 15 | # zp_min = min(p1['z'],p2['z']) 16 | # zp_max = max(p1['z'],p2['z']) 17 | # zq_min = min(q1['z'],q2['z']) 18 | # zq_max = max(q1['z'],q2['z']) 19 | 20 | # x0_min = max(xp_min,xq_min) 21 | # x0_max = min(xp_max,xq_max) 22 | 23 | # if x0_max <= x0_min: 24 | # return False 25 | 26 | # z0_min = max(zp_min,zq_min) 27 | # z0_max = min(zp_max,zq_max) 28 | 29 | # if z0_max <= z0_min: 30 | # return False 31 | 32 | # rise1 = p2['z'] - p1['z'] 33 | # rise2 = q2['z'] - q1['z'] 34 | # run1 = p2['x'] - p1['z'] 35 | # run2 = q2['x'] - q1['z'] 36 | 37 | # if run1 == 0 and run2 ==0: 38 | # return False 39 | 40 | # m1, m2 = None, None 41 | # if run1 != 0: 42 | # m1 = rise1/run1 43 | # if run2 != 0: 44 | # m2 = rise2/run2 45 | # if m1==m2: 46 | # return False 47 | 48 | class Polygon: 49 | def __init__(self, vertices, scaling_factor=18/256): 50 | self.scaling_factor = scaling_factor 51 | self.set_vertices(vertices) 52 | self.edges = self.get_edges() 53 | self.unsorted_area = self.calculate_polygon_area(self.vertices) 54 | self.sorted_area = self.calculate_polygon_area(self.sorted_vertices) 55 | self.width = self.max_x - self.min_x 56 | self.height = self.max_y - self.min_y 57 | 58 | 
def set_vertices(self, vertices): 59 | for vertex in vertices: 60 | vertex['x'] *= self.scaling_factor 61 | vertex['z'] *= self.scaling_factor 62 | self.vertices = vertices 63 | self.sorted_vertices = self.get_sorted_vertices() 64 | 65 | def get_edges(self): 66 | """ Generate edges by creating pairs of points """ 67 | return [(self.vertices[i], self.vertices[(i + 1) % len(self.vertices)]) for i in range(len(self.vertices))] 68 | # return [(self.sorted_vertices[i], self.sorted_vertices[(i + 1) % len(self.sorted_vertices)]) for i in range(len(self.sorted_vertices))] 69 | 70 | def get_sorted_vertices(self): 71 | def get_midpoint(vertices): 72 | sum_x, sum_z = 0, 0 73 | min_x, max_x, min_y, max_y = float('inf'), -float('inf'), float('inf'), -float('inf') 74 | for vertex in vertices: 75 | sum_x += vertex['x'] 76 | sum_z += vertex['z'] 77 | min_x, max_x = min(min_x, vertex['x']), max(max_x, vertex['x']) 78 | min_y, max_y = min(min_y, vertex['z']), max(max_y, vertex['z']) 79 | return (sum_x/len(vertices), sum_z/len(vertices)), (min_x, max_x, min_y, max_y) 80 | 81 | def get_slope_from_mid_point(vertices): 82 | (mid_x, mid_z), (min_x, max_x, min_y, max_y) = get_midpoint(vertices) 83 | ret = [] 84 | for vertex in vertices: 85 | ret.append((vertex, math.atan2(vertex['x'] - mid_x, vertex['z'] - mid_z))) 86 | return ret, (min_x, max_x, min_y, max_y) 87 | 88 | vertices_with_slopes, (self.min_x, self.max_x, self.min_y, self.max_y) = get_slope_from_mid_point(self.vertices) 89 | vertices_with_slopes = sorted(vertices_with_slopes, key=lambda x: x[1]) 90 | return [vertex[0] for vertex in vertices_with_slopes] 91 | 92 | def calculate_polygon_area(self, vertices, decimals=1): # shoelace formula 93 | n = len(vertices) 94 | area = 0 95 | for i in range(n): 96 | j = (i + 1) % n 97 | area += vertices[i]['x'] * vertices[j]['z'] 98 | area -= vertices[j]['x'] * vertices[i]['z'] 99 | area = abs(area) / 2.0 100 | return round(area, decimals) 101 | 102 | def surround(self, other): 103 | # TODO 104 | pass 105 | 106 | def overlap(self, other): 107 | 108 | for edge1 in self.edges: 109 | for edge2 in other.edges: 110 | if line_intersect(edge1[0], edge1[1], edge2[0], edge2[1]): 111 | return True 112 | return False 113 | 114 | 115 | -------------------------------------------------------------------------------- /src/utils/util.py: -------------------------------------------------------------------------------- 1 | def natural_sort_key(s): 2 | import re 3 | return [int(text) if text.isdigit() else text.lower() for text in re.split('(\d+)', s)] 4 | 5 | def list_folders(directory, use_natural_sort=True): 6 | import os 7 | folders = [folder for folder in os.listdir(directory) if os.path.isdir(os.path.join(directory, folder))] 8 | if use_natural_sort: 9 | folders.sort(key=natural_sort_key) 10 | else: 11 | folders.sort(key=str.lower) 12 | return folders 13 | 14 | def list_json_files(directory): 15 | import os 16 | json_files = [file for file in os.listdir(directory) if file.endswith('.json') and os.path.isfile(os.path.join(directory, file))] 17 | json_files.sort(key=natural_sort_key) 18 | return json_files --------------------------------------------------------------------------------
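As a closing illustration, here is a minimal sketch of how the `Polygon` helper in `src/utils/polygon_object.py` can be exercised, assuming the repository root is on `PYTHONPATH` and the package's dependencies are installed; the rectangle and the resulting numbers are illustrative only. With the default `scaling_factor=18/256`, a 64 by 128 rectangle on the raw grid scales to 4.5 by 9.0, and the shoelace formula reports an area of 40.5.

```python
# Illustrative only: exercises the Polygon class defined above on a toy rectangle.
from src.utils.polygon_object import Polygon

# Axis-aligned rectangle on the raw 256-unit grid (toy values).
verts = [
    {"x": 0, "z": 0},
    {"x": 64, "z": 0},
    {"x": 64, "z": 128},
    {"x": 0, "z": 128},
]

poly = Polygon(verts)             # default scaling_factor = 18/256 rescales in place
print(poly.width, poly.height)    # 4.5 9.0
print(poly.sorted_area)           # 40.5, via the shoelace formula on sorted vertices
print(poly.unsorted_area)         # 40.5 as well, since the input is already ordered
```

The `width` and `height` computed here are the polygon-derived values that the self-consistency and prompt-consistency metrics above compare against the stated room dimensions.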