├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── MODEL_CARD.md ├── README.md ├── apps ├── flask │ ├── requirements.txt │ ├── web_server.py │ └── web_server_single.py └── gradio │ ├── run.sh │ ├── set_up_venv.sh │ ├── webapp.py │ └── webapp_single.py ├── dataset └── alpaca_data.json ├── docs ├── README.md ├── download.png ├── llama_hf.md ├── llama_inference.png ├── llama_multigpu.png ├── llama_profiling.png ├── llama_webui.png ├── pyllama_7B_3GB.png └── pyllama_7B_6GB.png ├── download.sh ├── example.py ├── inference.py ├── inference_driver.py ├── llama ├── __init__.py ├── convert_llama.py ├── download.py ├── download_community.sh ├── download_community_stop.sh ├── generation.py ├── hf │ ├── __init__.py │ ├── configuration_llama.py │ ├── modeling_llama.py │ ├── tokenization_llama.py │ └── utils.py ├── llama_infer.py ├── llama_multigpu.py ├── llama_quant.py ├── model_parallel.py ├── model_single.py ├── tokenizer.py └── version.py ├── quant_infer.py ├── requirements-quant.txt ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 
39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at . All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to LLaMA 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `main`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Meta's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 
28 | 29 | ## License 30 | By contributing to LLaMA, you agree that your contributions will be licensed 31 | under the LICENSE file in the root directory of this source tree. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements*.txt 2 | include README.md 3 | include llama/download_community.sh 4 | -------------------------------------------------------------------------------- /MODEL_CARD.md: -------------------------------------------------------------------------------- 1 | # LLama Model Card 2 | 3 | ## Model details 4 | **Organization developing the model** 5 | The FAIR team of Meta AI. 6 | 7 | **Model date** 8 | LLaMA was trained between December. 2022 and Feb. 2023. 9 | 10 | **Model version** 11 | This is version 1 of the model. 12 | 13 | **Model type** 14 | LLaMA is an auto-regressive language model, based on the transformer architecture. The model comes in different sizes: 7B, 13B, 33B and 65B parameters. 15 | 16 | **Paper or resources for more information** 17 | More information can be found in the paper “LLaMA, Open and Efficient Foundation Language Models”, available at https://research.facebook.com/publications/llama-open-and-efficient-foundation-language-models/. 18 | 19 | **Citations details** 20 | https://research.facebook.com/publications/llama-open-and-efficient-foundation-language-models/ 21 | 22 | **License** 23 | Non-commercial bespoke license 24 | 25 | **Where to send questions or comments about the model** 26 | Questions and comments about LLaMA can be sent via the [GitHub repository](https://github.com/facebookresearch/llama) of the project , by opening an issue. 27 | 28 | ## Intended use 29 | **Primary intended uses** 30 | The primary use of LLaMA is research on large language models, including: 31 | exploring potential applications such as question answering, natural language understanding or reading comprehension, 32 | understanding capabilities and limitations of current language models, and developing techniques to improve those, 33 | evaluating and mitigating biases, risks, toxic and harmful content generations, hallucinations. 34 | 35 | **Primary intended users** 36 | The primary intended users of the model are researchers in natural language processing, machine learning and artificial intelligence. 37 | 38 | **Out-of-scope use cases** 39 | LLaMA is a base, or foundational, model. As such, it should not be used on downstream applications without further risk evaluation and mitigation. In particular, our model has not been trained with human feedback, and can thus generate toxic or offensive content, incorrect information or generally unhelpful answers. 40 | 41 | ## Factors 42 | **Relevant factors** 43 | One of the most relevant factors for which model performance may vary is which language is used. Although we included 20 languages in the training data, most of our dataset is made of English text, and we thus expect the model to perform better for English than other languages. Relatedly, it has been shown in previous studies that performance might vary for different dialects, and we expect that it will be the case for our model. 44 | 45 | **Evaluation factors** 46 | As our model is trained on data from the Web, we expect that it reflects biases from this source. 
We thus evaluated on RAI datasets to measure biases exhibited by the model for gender, religion, race, sexual orientation, age, nationality, disability, physical appearance and socio-economic status. We also measure the toxicity of model generations, depending on the toxicity of the context used to prompt the model. 47 | 48 | ## Metrics 49 | **Model performance measures** 50 | We use the following measure to evaluate the model: 51 | - Accuracy for common sense reasoning, reading comprehension, natural language understanding (MMLU), BIG-bench hard, WinoGender and CrowS-Pairs, 52 | - Exact match for question answering, 53 | - The toxicity score from Perspective API on RealToxicityPrompts. 54 | 55 | **Decision thresholds** 56 | Not applicable. 57 | 58 | **Approaches to uncertainty and variability** 59 | Due to the high computational requirements of training LLMs, we trained only one model of each size, and thus could not evaluate variability of pre-training. 60 | 61 | ## Evaluation datasets 62 | The model was evaluated on the following benchmarks: BoolQ, PIQA, SIQA, HellaSwag, WinoGrande, ARC, OpenBookQA, NaturalQuestions, TriviaQA, RACE, MMLU, BIG-bench hard, GSM8k, RealToxicityPrompts, WinoGender, CrowS-Pairs. 63 | 64 | ## Training dataset 65 | The model was trained using the following source of data: CCNet [67%], C4 [15%], GitHub [4.5%], Wikipedia [4.5%], Books [4.5%], ArXiv [2.5%], Stack Exchange[2%]. The Wikipedia and Books domains include data in the following languages: bg, ca, cs, da, de, en, es, fr, hr, hu, it, nl, pl, pt, ro, ru, sl, sr, sv, uk. See the paper for more details about the training set and corresponding preprocessing. 66 | 67 | ## Quantitative analysis 68 | Hyperparameters for the model architecture 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 84 | 85 | 87 | 88 | 90 | 91 | 93 | 94 |
*LLaMa Model hyper parameters*

| Number of parameters | dimension | n heads | n layers | Learn rate | Batch size | n tokens |
| -------------------- | --------- | ------- | -------- | ---------- | ---------- | -------- |
| 7B  | 4096 | 32 | 32 | 3.0E-04 | 4M | 1T   |
| 13B | 5120 | 40 | 40 | 3.0E-04 | 4M | 1T   |
| 33B | 6656 | 52 | 60 | 1.5E-04 | 4M | 1.4T |
| 65B | 8192 | 64 | 80 | 1.5E-04 | 4M | 1.4T |

*Table 1 - Summary of LLama Model Hyperparameters*

We present our results on eight standard common sense reasoning benchmarks in the table below.

*LLaMa Reasoning tasks*

| Number of parameters | BoolQ | PIQA | SIQA | HellaSwag | WinoGrande | ARC-e | ARC-c | OBQA | COPA |
| -------------------- | ----- | ---- | ---- | --------- | ---------- | ----- | ----- | ---- | ---- |
| 7B  | 76.5 | 79.8 | 48.9 | 76.1 | 70.1 | 76.7 | 47.6 | 57.2 | 93 |
| 13B | 78.1 | 80.1 | 50.4 | 79.2 | 73   | 78.1 | 52.7 | 56.4 | 94 |
| 33B | 83.1 | 82.3 | 50.4 | 82.8 | 76   | 81.4 | 57.8 | 58.6 | 92 |
| 65B | 85.3 | 82.8 | 52.3 | 84.2 | 77   | 81.5 | 56   | 60.2 | 94 |
120 | 121 | *Table 2 - Summary of LLama Model Performance on Reasoning tasks* 122 | 123 | 124 | We present our results on bias in the table below. Note that lower value is better indicating lower bias. 125 | 126 | 127 | | No | Category | FAIR LLM | 128 | | --- | -------------------- | -------- | 129 | | 1 | Gender | 70.6 | 130 | | 2 | Religion | 79 | 131 | | 3 | Race/Color | 57 | 132 | | 4 | Sexual orientation | 81 | 133 | | 5 | Age | 70.1 | 134 | | 6 | Nationality | 64.2 | 135 | | 7 | Disability | 66.7 | 136 | | 8 | Physical appearance | 77.8 | 137 | | 9 | Socioeconomic status | 71.5 | 138 | | | LLaMA Average | 66.6 | 139 | 140 | *Table 3 - Summary bias of our model output* 141 | 142 | 143 | 144 | ## Ethical considerations 145 | **Data** 146 | The data used to train the model is collected from various sources, mostly from the Web. As such, it contains offensive, harmful and biased content. We thus expect the model to exhibit such biases from the training data. 147 | 148 | **Human life** 149 | The model is not intended to inform decisions about matters central to human life, and should not be used in such a way. 150 | 151 | **Mitigations** 152 | We filtered the data from the Web based on its proximity to Wikipedia text and references. For this, we used a Kneser-Ney language model and a fastText linear classifier. 153 | 154 | **Risks and harms** 155 | Risks and harms of large language models include the generation of harmful, offensive or biased content. These models are often prone to generating incorrect information, sometimes referred to as hallucinations. We do not expect our model to be an exception in this regard. 156 | 157 | **Use cases** 158 | LLaMA is a foundational model, and as such, it should not be used for downstream applications without further investigation and mitigations of risks. These risks and potential fraught use cases include, but are not limited to: generation of misinformation and generation of harmful, biased or offensive content. 159 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🦙 LLaMA - Run LLM in A Single 4GB GPU 2 | 3 | 4 | > 📢 `pyllama` is a hacked version of `LLaMA` based on original Facebook's implementation but more convenient to run in a Single consumer grade GPU. 5 | 6 | > The Hugging Face's LLaMA implementation is available at `pyllama.hf`. 7 | 8 | ## 📥 Installation 9 | 10 | In a conda env with pytorch / cuda available, run: 11 | ``` 12 | pip install pyllama -U 13 | ``` 14 | 15 | > 🐏 If you have installed llama library from other sources, please uninstall the previous llama library and use `pip install pyllama -U` to install the latest version. 16 | 17 | 18 | ## 📦 Download Model Files 19 | 20 | ### 🧘‍♀️ Official Way 21 | 22 | In order to download the checkpoints and tokenizer, fill this [google form](https://forms.gle/jk851eBVbX1m5TAv5) 23 | 24 | Once your request is approved, you will receive links to download the tokenizer and model files. 25 | Edit the `download.sh` script with the signed url provided in the email to download the model weights and tokenizer. 26 | 27 | ### 🐒 Community Way 28 | 29 | - 1. pyllama 30 | 31 | There is another high-speed way to download the checkpoints and tokenizers. There are four models(7B,13B,30B,65B) available. 
To download all of them, run: 32 | 33 | ```bash 34 | python -m llama.download 35 | ``` 36 | 37 | To download only the 7B model files to your current directory, run: 38 | 39 | ```bash 40 | python -m llama.download --model_size 7B 41 | ``` 42 | 43 | To download only the 7B and 30B model files to folder `/tmp/pyllama_data`, run: 44 | 45 | ```bash 46 | python -m llama.download --model_size 7B,30B --folder /tmp/pyllama_data 47 | ``` 48 | 49 | The help doc is: 50 | ```bash 51 | $python -m llama.download --help 52 | usage: download.py [-h] [--model_size MODEL_SIZE] [--folder FOLDER] 53 | 54 | optional arguments: 55 | -h, --help show this help message and exit 56 | --model_size MODEL_SIZE 57 | The size of the models that you want to download. A comma separated 58 | string of any of "7B", "13B", "30B", "65B". Totally 219G disk space 59 | is needed to download them all. If you only want to download the 7B 60 | model, just put "7B" here. 61 | --folder FOLDER The target folder for the download files 62 | ``` 63 | 64 | - Sample Screenshot 65 | 66 | ![](docs/download.png) 67 | 68 | - 2. Bittorrent 69 | 70 | 🔥 In order to download the checkpoints and tokenizer, use this BitTorrent link: "[magnet:?xt=urn:btih:ZXXDAUWYLRUXXBHUYEMS6Q5CE5WA3LVA&dn=LLaMA](magnet:?xt=urn:btih:ZXXDAUWYLRUXXBHUYEMS6Q5CE5WA3LVA&dn=LLaMA)". 71 | 72 | 73 | ## 💎 Quantize LLaMA to run in a 4GB GPU 74 | 75 | `pyllama` support quantization of 2/3/4/8-bit so that you can run model in a 4G memory GPU. 76 | 77 | > You need to run `export HUGGING_FACE_HUB_TOKEN=XXX` to be able to access Hugging Face's data. You also need to install [gptq](https://pypi.org/project/gptq/) with command `pip install gptq`. 78 | 79 | ```bash 80 | python -m llama.llama_quant --help 81 | usage: llama_quant.py [-h] [--ckpt_dir CKPT_DIR] [--tokenizer_path TOKENIZER_PATH] 82 | [--seed SEED] [--nsamples NSAMPLES] [--percdamp PERCDAMP] 83 | [--nearest] [--wbits {2,3,4,8,16}] [--groupsize GROUPSIZE] 84 | [--save SAVE] [--load LOAD] [--benchmark BENCHMARK] [--check] 85 | [--cuda CUDA] [--eval] 86 | {wikitext2,ptb,c4} 87 | 88 | positional arguments: 89 | {wikitext2,ptb,c4} Where to extract calibration data from. 90 | 91 | optional arguments: 92 | -h, --help show this help message and exit 93 | --ckpt_dir CKPT_DIR 94 | --tokenizer_path TOKENIZER_PATH 95 | --seed SEED Seed for sampling the calibration data. 96 | --nsamples NSAMPLES Number of calibration data samples. 97 | --percdamp PERCDAMP Percent of the average Hessian diagonal to use for dampening. 98 | --nearest Whether to run the RTN baseline. 99 | --wbits {2,3,4,8} bits for quantization 100 | --groupsize GROUPSIZE 101 | Groupsize to use for quantization; default uses full row. 102 | --save SAVE Save quantized checkpoint under this name, eg pyllama-7B4b.pt. 103 | --load LOAD Load quantized model. 104 | --benchmark BENCHMARK 105 | Number of tokens to use for benchmarking. 106 | --check Whether to compute perplexity during benchmarking for verification. 107 | --cuda CUDA GPU device string, 'cuda:0' by default. 
108 | --eval Evaluate the model with dataset wikitext2, ptb and c4 109 | ``` 110 | 111 | - Quantize the 7B model to 8-bit 112 | 113 | ```bash 114 | python -m llama.llama_quant decapoda-research/llama-7b-hf c4 --wbits 8 --save pyllama-7B8b.pt 115 | ``` 116 | 117 | - Quantize the 7B model to 4-bit with groupsize 128 (the recommended setup 🔥) 118 | 119 | ```bash 120 | python -m llama.llama_quant decapoda-research/llama-7b-hf c4 --wbits 4 --groupsize 128 --save pyllama-7B4b.pt 121 | ``` 122 | 123 | - Quantize the 7B model to 2-bit 124 | 125 | ```bash 126 | python -m llama.llama_quant decapoda-research/llama-7b-hf c4 --wbits 2 --save pyllama-7B2b.pt 127 | ``` 128 | 129 | The download links for quantized LLaMA files are below: 130 | 131 | - 7B 132 | 133 | | Quant Type | Size | Link | MD5 | Loss | Password | 134 | |----------|:-------------:|------:|------:|------:|--:| 135 | | 2-bit | 2160484475 | [🔗](https://pan.baidu.com/s/1zOdKOHnSCsz6TFix2NTFtg) | 4c7215d28c1f650218c43fc46402cec5 | - | 8g9d | 136 | | 3-bit | - | - | - | - | - | 137 | | 4-bit | 3779485819 | - | cce9a3b522ddf5c011ee0174b2ff3dfb | - | - | 138 | | 8-bit | 7017493231 | - | 2648b09597cf8f9e0d1a04cb70b71cab | - | - | 139 | 140 | 141 | It took me 2 hours and 40 minutes to quantize the 65B model to 4-bit. The file size was reduced from 122GB to 32GB. 142 | 143 | > The following suggestions are recommended for LLM quantization: 144 | > 1. By default, use 4-bit quantization for LLM inference, as it offers a good trade-off between total model bits and zero-shot accuracy. 145 | > 2. Use a block size of 128 or lower to stabilize 4-bit quantization and improve zero-shot performance. 146 | > 3. Use a floating point or quantile quantization data type. In some cases, integer data types might be preferable to improve inference latency, depending on the implementation and hardware support. 147 | 148 | ## 🔮 Single GPU Inference 149 | 150 | ### 🥥 Without Quantization 151 | 152 | Set the environment variables `CKPT_DIR` to your LLaMA model folder, for example `/llama_data/7B`, and `TOKENIZER_PATH` to your tokenizer's path, such as `/llama_data/tokenizer.model`. 153 | 154 | Then run the following command: 155 | 156 | ```bash 157 | python inference.py --ckpt_dir $CKPT_DIR --tokenizer_path $TOKENIZER_PATH 158 | ``` 159 | 160 | The following is an example of LLaMA running on a single 8GB GPU. 161 | 162 | ![LLaMA Inference](https://raw.githubusercontent.com/juncongmoo/pyllama/main/docs/llama_inference.png) 163 | 164 | ### 🥝 With Quantization 165 | 166 | With quantization, you can run LLaMA on a GPU with 4GB of memory. 167 | 168 | - pyllama can run the 7B model with 6GB of GPU memory. 169 | Example: ```python quant_infer.py --wbits 4 --load pyllama-7B4b.pt --text "..." --max_length 24 --cuda cuda:0``` 170 | 171 | ![4bit-quant-6GB](https://github.com/juncongmoo/pyllama/blob/main/docs/pyllama_7B_6GB.png) 172 | 173 | - pyllama can run the 7B model with 3.2GB of GPU memory. 174 | Example: ```python quant_infer.py --wbits 2 --load pyllama-7B4b.pt --text "..." --max_length 32``` 175 | 176 | ![2bit-quant-3GB](https://github.com/juncongmoo/pyllama/blob/main/docs/pyllama_7B_3GB.png) 177 | 178 | ### 💡 Tips 179 | 180 | - To keep the KV cache in CPU memory, run `export KV_CAHCHE_IN_GPU=0` in the shell.
181 | 182 | - To profile CPU/GPU/Latency, run: 183 | 184 | ```bash 185 | python inference_driver.py --ckpt_dir $CKPT_DIR --tokenizer_path $TOKENIZER_PATH 186 | ``` 187 | 188 | A sample result is like: 189 | 190 | ![LLaMA Inference](https://raw.githubusercontent.com/juncongmoo/pyllama/main/docs/llama_profiling.png) 191 | 192 | - Tune `max_seq_len` and `max_batch_size` to reduce memory consumption to be able to run in GPU. Refer to: [this post](https://github.com/juncongmoo/pyllama/issues/9)! 193 | 194 | ### 🍉 Start a gradio webui 195 | 196 | 197 | ```bash 198 | $ cd apps/gradio 199 | $ python webapp_single.py --ckpt_dir $CKPT_DIR --tokenizer_path $TOKENIZER_PATH 200 | ``` 201 | 202 | You should see something like this in your browser: 203 | 204 | ![LLaMA Inference](https://raw.githubusercontent.com/juncongmoo/pyllama/main/docs/llama_webui.png) 205 | 206 | ### 🍓 Start a web server 207 | 208 | The following command will start a flask web server: 209 | 210 | ```bash 211 | $ cd apps/flask 212 | $ python web_server_single.py --ckpt_dir $CKPT_DIR --tokenizer_path $TOKENIZER_PATH 213 | ``` 214 | 215 | ## 🍒 Multiple GPU Inference 216 | 217 | ### 🧘‍♀️ Official Way 218 | 219 | To use the original META's model parallel, please set environment variable `PYLLAMA_META_MP` like: 220 | 221 | ``` 222 | export PYLLAMA_META_MP=1 223 | ``` 224 | 225 | With this environment variable set, you can `import llama` and the original META version's llama will be imported. 226 | 227 | The provided `example.py` can be run on a single or multi-gpu node with `torchrun` and will output completions for two pre-defined prompts. Using `TARGET_FOLDER` as defined in `download.sh`: 228 | 229 | ```bash 230 | torchrun --nproc_per_node MP example.py --ckpt_dir $TARGET_FOLDER/model_size \ 231 | --tokenizer_path $TARGET_FOLDER/tokenizer.model 232 | ``` 233 | 234 | Different models require different MP values: 235 | 236 | | Model | MP | 237 | |--------|----| 238 | | 7B | 1 | 239 | | 13B | 2 | 240 | | 30B | 4 | 241 | | 65B | 8 | 242 | 243 | ### 🐒 Community Way 244 | 245 | There are two steps to run LLaMA in multi-GPU environment. 
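As a rough end-to-end sketch (the `/llama_data/...` paths and the idea that step 2 consumes the directory written by step 1 are assumptions, not taken from this repo), the two steps described below chain together like this:

```bash
# Step 1 (sketch): convert the original checkpoint into a Hugging Face style checkpoint.
# The flags mirror the convert_llama help text below; the paths are placeholders.
python -m llama.convert_llama --ckpt_dir /llama_data/7B \
    --tokenizer_path /llama_data/tokenizer.model \
    --model_size 7B --output_dir /llama_data/7B_hf --to hf

# Step 2 (sketch): run the converted weights across the visible GPUs with HF accelerate.
# (Assumption: llama_multigpu reads the directory written by step 1.)
python -m llama.llama_multigpu --state_dict_dir /llama_data/7B_hf --model_size 7B
```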
246 | 247 | - Convert original LLaMA model 248 | 249 | ```bash 250 | $python -m llama.convert_llama --help 251 | usage: convert_llama.py [-h] [--ckpt_dir CKPT_DIR] [--tokenizer_path TOKENIZER_PATH] 252 | [--model_size {7B,13B,30B,65B}] [--output_dir OUTPUT_DIR] 253 | [--max_batch_size MAX_BATCH_SIZE] [--to {hf,fb}] 254 | 255 | optional arguments: 256 | -h, --help show this help message and exit 257 | --ckpt_dir CKPT_DIR 258 | --tokenizer_path TOKENIZER_PATH 259 | --model_size {7B,13B,30B,65B} 260 | --output_dir OUTPUT_DIR 261 | Location to write HF model and tokenizer 262 | --max_batch_size MAX_BATCH_SIZE 263 | --to {hf,fb} 264 | ``` 265 | 266 | - Run with HF's accelerate with multiple GPUs 267 | 268 | ```bash 269 | $python -m llama.llama_multigpu --help 270 | usage: llama_multigpu.py [-h] [--state_dict_dir STATE_DICT_DIR] [--model_size {7B,13B,30B,65B}] 271 | 272 | optional arguments: 273 | -h, --help show this help message and exit 274 | --state_dict_dir STATE_DICT_DIR 275 | --model_size {7B,13B,30B,65B} 276 | ``` 277 | 278 | ![](https://github.com/juncongmoo/pyllama/blob/main/docs/llama_multigpu.png) 279 | 280 | ## 🔬 Model Fine Tuning 281 | 282 | ### With [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca) Instruction-Following Dataset 283 | 284 | - Tokenization 285 | - Finetuning 286 | - Efficient FT 287 | 288 | ## 🧬 LLaMA model structure 289 | 290 | - Meta 291 | - Hugging Face 292 | 293 | ``` 294 | https://github.com/facebookresearch/llama/blob/main/llama/model.py#LL127C27-L127C27 295 | ``` 296 | 297 | ### Model Card 298 | 299 | See [MODEL_CARD.md](https://github.com/juncongmoo/pyllama/blob/main/MODEL_CARD.md) 300 | 301 | ### License 302 | 303 | See the [LICENSE](https://github.com/juncongmoo/pyllama/blob/main/LICENSE) file. 304 | -------------------------------------------------------------------------------- /apps/flask/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi 2 | uvicorn 3 | 4 | -------------------------------------------------------------------------------- /apps/flask/web_server.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the GNU General Public License version 3. 
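# Usage sketch (assumption, not part of the original file): launch with torchrun so that
# LOCAL_RANK and WORLD_SIZE are set, e.g.
#   torchrun --nproc_per_node <MP> web_server.py --ckpt_dir <dir> --tokenizer_path <tokenizer.model>
# Rank 0 then serves POST /llama/ on port 8042 (see uvicorn.run below) with a JSON body like
#   {"prompts": ["Hello"], "max_gen_len": 64, "temperature": 0.8, "top_p": 0.95}
# and returns {"responses": [...]}.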
3 | 4 | from typing import Tuple 5 | import os 6 | import sys 7 | import argparse 8 | import torch 9 | import time 10 | import json 11 | 12 | from pathlib import Path 13 | from typing import List 14 | 15 | from pydantic import BaseModel 16 | from fastapi import FastAPI 17 | import uvicorn 18 | import torch.distributed as dist 19 | 20 | from fairscale.nn.model_parallel.initialize import initialize_model_parallel 21 | 22 | from llama import ModelArgs, Transformer, Tokenizer, LLaMA 23 | 24 | 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("--ckpt_dir", type=str, required=True) 27 | parser.add_argument("--tokenizer_path", type=str, required=True) 28 | parser.add_argument("--max_seq_len", type=int, default=512) 29 | parser.add_argument("--max_batch_size", type=int, default=1) 30 | 31 | 32 | app = FastAPI() 33 | 34 | 35 | def setup_model_parallel() -> Tuple[int, int]: 36 | local_rank = int(os.environ.get("LOCAL_RANK", -1)) 37 | world_size = int(os.environ.get("WORLD_SIZE", -1)) 38 | 39 | dist.init_process_group("nccl") 40 | initialize_model_parallel(world_size) 41 | torch.cuda.set_device(local_rank) 42 | 43 | # seed must be the same in all processes 44 | torch.manual_seed(1) 45 | return local_rank, world_size 46 | 47 | 48 | def load( 49 | ckpt_dir: str, 50 | tokenizer_path: str, 51 | local_rank: int, 52 | world_size: int, 53 | max_seq_len: int, 54 | max_batch_size: int, 55 | ) -> LLaMA: 56 | start_time = time.time() 57 | checkpoints = sorted(Path(ckpt_dir).glob("*.pth")) 58 | assert world_size == len( 59 | checkpoints 60 | ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}" 61 | ckpt_path = checkpoints[local_rank] 62 | print("Loading") 63 | checkpoint = torch.load(ckpt_path, map_location="cpu") 64 | with open(Path(ckpt_dir) / "params.json", "r") as f: 65 | params = json.loads(f.read()) 66 | 67 | model_args: ModelArgs = ModelArgs( 68 | max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params 69 | ) 70 | tokenizer = Tokenizer(model_path=tokenizer_path) 71 | model_args.vocab_size = tokenizer.n_words 72 | torch.set_default_tensor_type(torch.cuda.HalfTensor) 73 | model = Transformer(model_args) 74 | torch.set_default_tensor_type(torch.FloatTensor) 75 | model.load_state_dict(checkpoint, strict=False) 76 | 77 | generator = LLaMA(model, tokenizer) 78 | print(f"Loaded in {time.time() - start_time:.2f} seconds") 79 | return generator 80 | 81 | 82 | def init_generator( 83 | ckpt_dir: str, 84 | tokenizer_path: str, 85 | max_seq_len: int = 512, 86 | max_batch_size: int = 32, 87 | ): 88 | local_rank, world_size = setup_model_parallel() 89 | if local_rank > 0: 90 | sys.stdout = open(os.devnull, "w") 91 | 92 | generator = load( 93 | ckpt_dir, tokenizer_path, local_rank, world_size, max_seq_len, max_batch_size 94 | ) 95 | 96 | return generator 97 | 98 | 99 | if __name__ == "__main__": 100 | args = parser.parse_args() 101 | generator = init_generator( 102 | args.ckpt_dir, 103 | args.tokenizer_path, 104 | args.max_seq_len, 105 | args.max_batch_size, 106 | ) 107 | 108 | class Config(BaseModel): 109 | prompts: List[str] 110 | max_gen_len: int 111 | temperature: float = 0.8 112 | top_p: float = 0.95 113 | 114 | if dist.get_rank() == 0: 115 | 116 | @app.post("/llama/") 117 | def generate(config: Config): 118 | if len(config.prompts) > args.max_batch_size: 119 | return {"error": "too much prompts."} 120 | for prompt in config.prompts: 121 | if len(prompt) + config.max_gen_len > args.max_seq_len: 122 | return {"error": "max_gen_len too large."} 123 | 
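        # Rank 0 broadcasts the request parameters here; the non-zero ranks, spinning in the
        # else-branch below, receive them via broadcast_object_list and call generate() with
        # the same arguments, keeping all model-parallel shards in sync.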
dist.broadcast_object_list( 124 | [config.prompts, config.max_gen_len, config.temperature, config.top_p] 125 | ) 126 | 127 | results = generator.generate( 128 | config.prompts, 129 | max_gen_len=config.max_gen_len, 130 | temperature=config.temperature, 131 | top_p=config.top_p, 132 | ) 133 | 134 | return {"responses": results} 135 | 136 | uvicorn.run(app, host="0.0.0.0", port=8042) 137 | else: 138 | while True: 139 | config = [None] * 4 140 | try: 141 | dist.broadcast_object_list(config) 142 | generator.generate( 143 | config[0], 144 | max_gen_len=config[1], 145 | temperature=config[2], 146 | top_p=config[3], 147 | ) 148 | except: 149 | pass 150 | -------------------------------------------------------------------------------- /apps/flask/web_server_single.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import json 3 | 4 | from pathlib import Path 5 | from typing import List 6 | 7 | from pydantic import BaseModel 8 | from fastapi import FastAPI 9 | import uvicorn 10 | import torch.distributed as dist 11 | 12 | from llama import ModelArgs, Transformer, Tokenizer, LLaMA 13 | 14 | 15 | def get_args(): 16 | import argparse 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("--ckpt_dir", type=str, default="/llama_data/7B") 20 | parser.add_argument( 21 | "--tokenizer_path", type=str, default="/llama_data/tokenizer.model" 22 | ) 23 | parser.add_argument("--max_seq_len", type=int, default=512) 24 | parser.add_argument("--max_batch_size", type=int, default=1) 25 | return parser.parse_args() 26 | 27 | 28 | app = FastAPI() 29 | 30 | 31 | def load( 32 | ckpt_dir: str, 33 | tokenizer_path: str, 34 | local_rank: int, 35 | world_size: int, 36 | max_seq_len: int, 37 | max_batch_size: int, 38 | ) -> LLaMA: 39 | checkpoints = sorted(Path(ckpt_dir).glob("*.pth")) 40 | assert world_size == len( 41 | checkpoints 42 | ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}" 43 | ckpt_path = checkpoints[local_rank] 44 | 45 | checkpoint = torch.load(ckpt_path, map_location="cpu") 46 | 47 | with open(Path(ckpt_dir) / "params.json", "r") as f: 48 | params = json.loads(f.read()) 49 | 50 | model_args: ModelArgs = ModelArgs( 51 | max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params 52 | ) 53 | tokenizer = Tokenizer(model_path=tokenizer_path) 54 | model_args.vocab_size = tokenizer.n_words 55 | torch.set_default_tensor_type(torch.cuda.HalfTensor) 56 | model = Transformer(model_args) 57 | torch.set_default_tensor_type(torch.FloatTensor) 58 | model.load_state_dict(checkpoint, strict=False) 59 | generator = LLaMA(model, tokenizer) 60 | return generator 61 | 62 | 63 | def init_generator( 64 | ckpt_dir: str, 65 | tokenizer_path: str, 66 | max_seq_len: int = 512, 67 | max_batch_size: int = 1, 68 | ): 69 | local_rank, world_size = 0, 1 70 | generator = load( 71 | ckpt_dir, tokenizer_path, local_rank, world_size, max_seq_len, max_batch_size 72 | ) 73 | 74 | return generator 75 | 76 | 77 | if __name__ == "__main__": 78 | args = get_args() 79 | generator = init_generator( 80 | args.ckpt_dir, 81 | args.tokenizer_path, 82 | args.max_seq_len, 83 | args.max_batch_size, 84 | ) 85 | 86 | class Config(BaseModel): 87 | prompts: List[str] 88 | max_gen_len: int 89 | temperature: float = 0.8 90 | top_p: float = 0.95 91 | 92 | @app.post("/llama/") 93 | def generate(config: Config): 94 | if len(config.prompts) > args.max_batch_size: 95 | return {"error": "too much prompts."} 96 | for prompt in config.prompts: 97 | if len(prompt) 
+ config.max_gen_len > args.max_seq_len: 98 | return {"error": "max_gen_len too large."} 99 | results = generator.generate( 100 | config.prompts, 101 | max_gen_len=config.max_gen_len, 102 | temperature=config.temperature, 103 | top_p=config.top_p, 104 | ) 105 | return {"responses": results} 106 | 107 | uvicorn.run(app, host="0.0.0.0", port=8080) 108 | -------------------------------------------------------------------------------- /apps/gradio/run.sh: -------------------------------------------------------------------------------- 1 | # 2 | # first build the virtualenv using the virtualenv.sh script 3 | # 4 | # gradio webapp.py 5 | torchrun --nproc_per_node $MP webapp.py --ckpt_dir $CKPT_DIR --tokenizer_path $TOKENIZER_PATH 6 | # 7 | # or use CUDA_VISIBLE_DEVICES if you want to target a specific gpu device 8 | # CUDA_VISIBLE_DEVICES=1 torchrun --nproc_per_node $MP webapp.py 9 | # 10 | -------------------------------------------------------------------------------- /apps/gradio/set_up_venv.sh: -------------------------------------------------------------------------------- 1 | rm -rf llama_env 2 | python3 -m venv llama_env 3 | source llama_env/bin/activate 4 | 5 | pip uninstall llama -U 6 | pip install pyllama -U 7 | pip install gradio 8 | 9 | -------------------------------------------------------------------------------- /apps/gradio/webapp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import fire 5 | import time 6 | import json 7 | 8 | import gradio as gr 9 | 10 | from typing import Tuple 11 | from pathlib import Path 12 | from fairscale.nn.model_parallel.initialize import initialize_model_parallel 13 | from llama import ModelArgs, Transformer, Tokenizer, LLaMA 14 | 15 | 16 | def setup_model_parallel() -> Tuple[int, int]: 17 | local_rank = int(os.environ.get("LOCAL_RANK", -1)) 18 | world_size = int(os.environ.get("WORLD_SIZE", -1)) 19 | 20 | torch.distributed.init_process_group("nccl") 21 | initialize_model_parallel(world_size) 22 | torch.cuda.set_device(local_rank) 23 | 24 | # seed must be the same in all processes 25 | torch.manual_seed(1) 26 | return local_rank, world_size 27 | 28 | 29 | def load( 30 | ckpt_dir: str, 31 | tokenizer_path: str, 32 | local_rank: int, 33 | world_size: int, 34 | max_seq_len: int, 35 | max_batch_size: int, 36 | ) -> LLaMA: 37 | checkpoints = sorted(Path(ckpt_dir).glob("*.pth")) 38 | assert world_size == len( 39 | checkpoints 40 | ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}" 41 | ckpt_path = checkpoints[local_rank] 42 | 43 | checkpoint = torch.load(ckpt_path, map_location="cpu") 44 | 45 | with open(Path(ckpt_dir) / "params.json", "r") as f: 46 | params = json.loads(f.read()) 47 | 48 | model_args: ModelArgs = ModelArgs( 49 | max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params 50 | ) 51 | tokenizer = Tokenizer(model_path=tokenizer_path) 52 | model_args.vocab_size = tokenizer.n_words 53 | torch.set_default_tensor_type(torch.cuda.HalfTensor) 54 | model = Transformer(model_args) 55 | torch.set_default_tensor_type(torch.FloatTensor) 56 | model.load_state_dict(checkpoint, strict=False) 57 | generator = LLaMA(model, tokenizer) 58 | return generator 59 | 60 | 61 | def process(prompt: str): 62 | print("Received:\n", prompt) 63 | prompts = [prompt] 64 | results = generator.generate( 65 | prompts, max_gen_len=256, temperature=temperature, top_p=top_p 66 | ) 67 | print("Generated:\n", results[0]) 68 | return str(results[0]) 69 | 70 
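# process() relies on the module-level names `generator`, `temperature`, and `top_p`,
# which are assigned in the __main__ block below, so it is only valid once the model
# has been loaded there.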
| 71 | def get_args(): 72 | import argparse 73 | 74 | parser = argparse.ArgumentParser() 75 | parser.add_argument("--ckpt_dir", type=str, default="/llama_data/7B") 76 | parser.add_argument( 77 | "--tokenizer_path", type=str, default="/llama_data/tokenizer.model" 78 | ) 79 | return parser.parse_args() 80 | 81 | 82 | if __name__ == "__main__": 83 | args = get_args() 84 | ckpt_dir = args.ckpt_dir 85 | tokenizer_path = args.tokenizer_path 86 | temperature = 0.8 87 | top_p = 0.95 88 | max_seq_len = 512 89 | max_batch_size = 32 90 | 91 | local_rank, world_size = setup_model_parallel() 92 | if local_rank > 0: 93 | sys.stdout = open(os.devnull, "w") 94 | 95 | generator = load( 96 | ckpt_dir, tokenizer_path, local_rank, world_size, max_seq_len, max_batch_size 97 | ) 98 | 99 | demo = gr.Interface( 100 | fn=process, 101 | inputs=gr.Textbox(lines=10, placeholder="Your prompt here..."), 102 | outputs="text", 103 | ) 104 | 105 | # To create a public link, set `share=True` in `launch()`. 106 | demo.launch(share=True) 107 | -------------------------------------------------------------------------------- /apps/gradio/webapp_single.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import fire 5 | import time 6 | import json 7 | 8 | import gradio as gr 9 | 10 | from typing import Tuple 11 | from pathlib import Path 12 | from fairscale.nn.model_parallel.initialize import initialize_model_parallel 13 | from llama import ModelArgs, Transformer, Tokenizer, LLaMA 14 | 15 | 16 | def load( 17 | ckpt_dir: str, 18 | tokenizer_path: str, 19 | local_rank: int, 20 | world_size: int, 21 | max_seq_len: int, 22 | max_batch_size: int, 23 | ) -> LLaMA: 24 | checkpoints = sorted(Path(ckpt_dir).glob("*.pth")) 25 | assert world_size == len( 26 | checkpoints 27 | ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}" 28 | ckpt_path = checkpoints[local_rank] 29 | 30 | checkpoint = torch.load(ckpt_path, map_location="cpu") 31 | 32 | with open(Path(ckpt_dir) / "params.json", "r") as f: 33 | params = json.loads(f.read()) 34 | 35 | model_args: ModelArgs = ModelArgs( 36 | max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params 37 | ) 38 | tokenizer = Tokenizer(model_path=tokenizer_path) 39 | model_args.vocab_size = tokenizer.n_words 40 | torch.set_default_tensor_type(torch.cuda.HalfTensor) 41 | model = Transformer(model_args) 42 | torch.set_default_tensor_type(torch.FloatTensor) 43 | model.load_state_dict(checkpoint, strict=False) 44 | generator = LLaMA(model, tokenizer) 45 | return generator 46 | 47 | 48 | def process(prompt: str): 49 | print("Received:\n", prompt) 50 | prompts = [prompt] 51 | results = generator.generate( 52 | prompts, max_gen_len=256, temperature=temperature, top_p=top_p 53 | ) 54 | print("Generated:\n", results[0]) 55 | return str(results[0]) 56 | 57 | 58 | def get_args(): 59 | import argparse 60 | 61 | parser = argparse.ArgumentParser() 62 | parser.add_argument("--ckpt_dir", type=str, default="/llama_data/7B") 63 | parser.add_argument( 64 | "--tokenizer_path", type=str, default="/llama_data/tokenizer.model" 65 | ) 66 | return parser.parse_args() 67 | 68 | 69 | if __name__ == "__main__": 70 | args = get_args() 71 | ckpt_dir = args.ckpt_dir 72 | tokenizer_path = args.tokenizer_path 73 | temperature = 0.8 74 | top_p = 0.95 75 | max_seq_len = 512 76 | max_batch_size = 1 77 | 78 | local_rank, world_size = 0, 1 79 | generator = load( 80 | ckpt_dir, tokenizer_path, local_rank, world_size, 
max_seq_len, max_batch_size 81 | ) 82 | 83 | demo = gr.Interface( 84 | fn=process, 85 | inputs=gr.Textbox(lines=10, placeholder="Your prompt here..."), 86 | outputs="text", 87 | ) 88 | 89 | # To create a public link, set `share=True` in `launch()`. 90 | demo.launch(share=True) 91 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | $python inference.py 3 | Loading 4 | type(checkpoint): 5 | LLaMA Core model: 6 | Transformer( 7 | (tok_embeddings): Embedding(32000, 4096) 8 | (layers): ModuleList( 9 | (0): TransformerBlock( 10 | (attention): Attention( 11 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 12 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 13 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 14 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 15 | ) 16 | (feed_forward): FeedForward( 17 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 18 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 19 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 20 | ) 21 | (attention_norm): RMSNorm() 22 | (ffn_norm): RMSNorm() 23 | ) 24 | (1): TransformerBlock( 25 | (attention): Attention( 26 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 27 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 28 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 29 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 30 | ) 31 | (feed_forward): FeedForward( 32 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 33 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 34 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 35 | ) 36 | (attention_norm): RMSNorm() 37 | (ffn_norm): RMSNorm() 38 | ) 39 | (2): TransformerBlock( 40 | (attention): Attention( 41 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 42 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 43 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 44 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 45 | ) 46 | (feed_forward): FeedForward( 47 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 48 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 49 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 50 | ) 51 | (attention_norm): RMSNorm() 52 | (ffn_norm): RMSNorm() 53 | ) 54 | (3): TransformerBlock( 55 | (attention): Attention( 56 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 57 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 58 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 59 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 60 | ) 61 | (feed_forward): FeedForward( 62 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 63 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 64 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 65 | ) 66 | (attention_norm): RMSNorm() 67 | (ffn_norm): RMSNorm() 68 | ) 69 | (4): TransformerBlock( 70 | (attention): Attention( 71 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 72 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 73 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 74 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 75 | ) 76 | 
(feed_forward): FeedForward( 77 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 78 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 79 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 80 | ) 81 | (attention_norm): RMSNorm() 82 | (ffn_norm): RMSNorm() 83 | ) 84 | (5): TransformerBlock( 85 | (attention): Attention( 86 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 87 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 88 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 89 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 90 | ) 91 | (feed_forward): FeedForward( 92 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 93 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 94 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 95 | ) 96 | (attention_norm): RMSNorm() 97 | (ffn_norm): RMSNorm() 98 | ) 99 | (6): TransformerBlock( 100 | (attention): Attention( 101 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 102 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 103 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 104 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 105 | ) 106 | (feed_forward): FeedForward( 107 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 108 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 109 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 110 | ) 111 | (attention_norm): RMSNorm() 112 | (ffn_norm): RMSNorm() 113 | ) 114 | (7): TransformerBlock( 115 | (attention): Attention( 116 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 117 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 118 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 119 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 120 | ) 121 | (feed_forward): FeedForward( 122 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 123 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 124 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 125 | ) 126 | (attention_norm): RMSNorm() 127 | (ffn_norm): RMSNorm() 128 | ) 129 | (8): TransformerBlock( 130 | (attention): Attention( 131 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 132 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 133 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 134 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 135 | ) 136 | (feed_forward): FeedForward( 137 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 138 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 139 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 140 | ) 141 | (attention_norm): RMSNorm() 142 | (ffn_norm): RMSNorm() 143 | ) 144 | (9): TransformerBlock( 145 | (attention): Attention( 146 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 147 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 148 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 149 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 150 | ) 151 | (feed_forward): FeedForward( 152 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 153 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 154 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 155 | ) 156 | (attention_norm): RMSNorm() 157 
| (ffn_norm): RMSNorm() 158 | ) 159 | (10): TransformerBlock( 160 | (attention): Attention( 161 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 162 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 163 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 164 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 165 | ) 166 | (feed_forward): FeedForward( 167 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 168 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 169 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 170 | ) 171 | (attention_norm): RMSNorm() 172 | (ffn_norm): RMSNorm() 173 | ) 174 | (11): TransformerBlock( 175 | (attention): Attention( 176 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 177 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 178 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 179 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 180 | ) 181 | (feed_forward): FeedForward( 182 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 183 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 184 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 185 | ) 186 | (attention_norm): RMSNorm() 187 | (ffn_norm): RMSNorm() 188 | ) 189 | (12): TransformerBlock( 190 | (attention): Attention( 191 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 192 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 193 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 194 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 195 | ) 196 | (feed_forward): FeedForward( 197 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 198 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 199 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 200 | ) 201 | (attention_norm): RMSNorm() 202 | (ffn_norm): RMSNorm() 203 | ) 204 | (13): TransformerBlock( 205 | (attention): Attention( 206 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 207 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 208 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 209 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 210 | ) 211 | (feed_forward): FeedForward( 212 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 213 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 214 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 215 | ) 216 | (attention_norm): RMSNorm() 217 | (ffn_norm): RMSNorm() 218 | ) 219 | (14): TransformerBlock( 220 | (attention): Attention( 221 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 222 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 223 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 224 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 225 | ) 226 | (feed_forward): FeedForward( 227 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 228 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 229 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 230 | ) 231 | (attention_norm): RMSNorm() 232 | (ffn_norm): RMSNorm() 233 | ) 234 | (15): TransformerBlock( 235 | (attention): Attention( 236 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 237 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 238 | (wv): 
Linear(in_features=4096, out_features=4096, bias=False) 239 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 240 | ) 241 | (feed_forward): FeedForward( 242 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 243 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 244 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 245 | ) 246 | (attention_norm): RMSNorm() 247 | (ffn_norm): RMSNorm() 248 | ) 249 | (16): TransformerBlock( 250 | (attention): Attention( 251 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 252 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 253 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 254 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 255 | ) 256 | (feed_forward): FeedForward( 257 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 258 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 259 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 260 | ) 261 | (attention_norm): RMSNorm() 262 | (ffn_norm): RMSNorm() 263 | ) 264 | (17): TransformerBlock( 265 | (attention): Attention( 266 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 267 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 268 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 269 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 270 | ) 271 | (feed_forward): FeedForward( 272 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 273 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 274 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 275 | ) 276 | (attention_norm): RMSNorm() 277 | (ffn_norm): RMSNorm() 278 | ) 279 | (18): TransformerBlock( 280 | (attention): Attention( 281 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 282 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 283 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 284 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 285 | ) 286 | (feed_forward): FeedForward( 287 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 288 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 289 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 290 | ) 291 | (attention_norm): RMSNorm() 292 | (ffn_norm): RMSNorm() 293 | ) 294 | (19): TransformerBlock( 295 | (attention): Attention( 296 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 297 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 298 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 299 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 300 | ) 301 | (feed_forward): FeedForward( 302 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 303 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 304 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 305 | ) 306 | (attention_norm): RMSNorm() 307 | (ffn_norm): RMSNorm() 308 | ) 309 | (20): TransformerBlock( 310 | (attention): Attention( 311 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 312 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 313 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 314 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 315 | ) 316 | (feed_forward): FeedForward( 317 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 318 | (w2): 
Linear(in_features=11008, out_features=4096, bias=False) 319 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 320 | ) 321 | (attention_norm): RMSNorm() 322 | (ffn_norm): RMSNorm() 323 | ) 324 | (21): TransformerBlock( 325 | (attention): Attention( 326 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 327 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 328 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 329 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 330 | ) 331 | (feed_forward): FeedForward( 332 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 333 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 334 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 335 | ) 336 | (attention_norm): RMSNorm() 337 | (ffn_norm): RMSNorm() 338 | ) 339 | (22): TransformerBlock( 340 | (attention): Attention( 341 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 342 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 343 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 344 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 345 | ) 346 | (feed_forward): FeedForward( 347 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 348 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 349 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 350 | ) 351 | (attention_norm): RMSNorm() 352 | (ffn_norm): RMSNorm() 353 | ) 354 | (23): TransformerBlock( 355 | (attention): Attention( 356 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 357 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 358 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 359 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 360 | ) 361 | (feed_forward): FeedForward( 362 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 363 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 364 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 365 | ) 366 | (attention_norm): RMSNorm() 367 | (ffn_norm): RMSNorm() 368 | ) 369 | (24): TransformerBlock( 370 | (attention): Attention( 371 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 372 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 373 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 374 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 375 | ) 376 | (feed_forward): FeedForward( 377 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 378 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 379 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 380 | ) 381 | (attention_norm): RMSNorm() 382 | (ffn_norm): RMSNorm() 383 | ) 384 | (25): TransformerBlock( 385 | (attention): Attention( 386 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 387 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 388 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 389 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 390 | ) 391 | (feed_forward): FeedForward( 392 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 393 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 394 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 395 | ) 396 | (attention_norm): RMSNorm() 397 | (ffn_norm): RMSNorm() 398 | ) 399 | (26): TransformerBlock( 400 | (attention): 
Attention( 401 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 402 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 403 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 404 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 405 | ) 406 | (feed_forward): FeedForward( 407 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 408 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 409 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 410 | ) 411 | (attention_norm): RMSNorm() 412 | (ffn_norm): RMSNorm() 413 | ) 414 | (27): TransformerBlock( 415 | (attention): Attention( 416 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 417 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 418 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 419 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 420 | ) 421 | (feed_forward): FeedForward( 422 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 423 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 424 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 425 | ) 426 | (attention_norm): RMSNorm() 427 | (ffn_norm): RMSNorm() 428 | ) 429 | (28): TransformerBlock( 430 | (attention): Attention( 431 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 432 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 433 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 434 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 435 | ) 436 | (feed_forward): FeedForward( 437 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 438 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 439 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 440 | ) 441 | (attention_norm): RMSNorm() 442 | (ffn_norm): RMSNorm() 443 | ) 444 | (29): TransformerBlock( 445 | (attention): Attention( 446 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 447 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 448 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 449 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 450 | ) 451 | (feed_forward): FeedForward( 452 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 453 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 454 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 455 | ) 456 | (attention_norm): RMSNorm() 457 | (ffn_norm): RMSNorm() 458 | ) 459 | (30): TransformerBlock( 460 | (attention): Attention( 461 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 462 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 463 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 464 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 465 | ) 466 | (feed_forward): FeedForward( 467 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 468 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 469 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 470 | ) 471 | (attention_norm): RMSNorm() 472 | (ffn_norm): RMSNorm() 473 | ) 474 | (31): TransformerBlock( 475 | (attention): Attention( 476 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 477 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 478 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 479 | (wo): Linear(in_features=4096, 
out_features=4096, bias=False) 480 | ) 481 | (feed_forward): FeedForward( 482 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 483 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 484 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 485 | ) 486 | (attention_norm): RMSNorm() 487 | (ffn_norm): RMSNorm() 488 | ) 489 | ) 490 | (norm): RMSNorm() 491 | (output): Linear(in_features=4096, out_features=32000, bias=False) 492 | ) 493 | Loaded in 19.94 seconds 494 | I believe the meaning of life is to appreciate everything you have. 495 | This is a journey for me to follow my heart and do what I love. I believe that life is to be lived in the moment and to give yourself the opportunity to dream and have the courage to pursue those dreams. 496 | Everything I do in my life is based on living in the moment. I think it is important to remember that we only have the moment with us. Life is fragile and there is no time to waste. We should all live in the moment and make the most of our lives. 497 | I am not a believer in good or bad. Everything that happens in our lives is the right thing for us to have at that time. We are always moving and growing and learning and life is a wonderful journey. 498 | My role as an artist is to try and depict my feelings and what I am going through. I hope that when people view my art they will also feel my emotions and connect to the pieces. 499 | I am an Australian born artist living in Perth, Australia. I work with acrylic paints, inks, gouache, oil paints and mixed media. I have been a full-time artist since 2008. I have exhibited 500 | ================================== 501 | ``` 502 | -------------------------------------------------------------------------------- /docs/download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/henrywoo/pyllama/9dca874d11ca2dbb6ebe4ffe48424f05f20a57eb/docs/download.png -------------------------------------------------------------------------------- /docs/llama_hf.md: -------------------------------------------------------------------------------- 1 | ``` 2 | LLaMAForCausalLM( 3 | (model): LLaMAModel( 4 | (embed_tokens): Embedding(32000, 4096, padding_idx=31999) 5 | (layers): ModuleList( 6 | (0): LLaMADecoderLayer( 7 | (self_attn): LLaMAAttention( 8 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 9 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 10 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 11 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 12 | (rotary_emb): RotaryEmbedding() 13 | ) 14 | (mlp): LLaMAMLP( 15 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 16 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 17 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 18 | (act_fn): SiLUActivation() 19 | ) 20 | (input_layernorm): RMSNorm() 21 | (post_attention_layernorm): RMSNorm() 22 | ) 23 | (1): LLaMADecoderLayer( 24 | (self_attn): LLaMAAttention( 25 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 26 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 27 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 28 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 29 | (rotary_emb): RotaryEmbedding() 30 | ) 31 | (mlp): LLaMAMLP( 32 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 33 | 
(down_proj): Linear(in_features=11008, out_features=4096, bias=False) 34 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 35 | (act_fn): SiLUActivation() 36 | ) 37 | (input_layernorm): RMSNorm() 38 | (post_attention_layernorm): RMSNorm() 39 | ) 40 | (2): LLaMADecoderLayer( 41 | (self_attn): LLaMAAttention( 42 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 43 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 44 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 45 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 46 | (rotary_emb): RotaryEmbedding() 47 | ) 48 | (mlp): LLaMAMLP( 49 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 50 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 51 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 52 | (act_fn): SiLUActivation() 53 | ) 54 | (input_layernorm): RMSNorm() 55 | (post_attention_layernorm): RMSNorm() 56 | ) 57 | (3): LLaMADecoderLayer( 58 | (self_attn): LLaMAAttention( 59 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 60 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 61 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 62 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 63 | (rotary_emb): RotaryEmbedding() 64 | ) 65 | (mlp): LLaMAMLP( 66 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 67 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 68 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 69 | (act_fn): SiLUActivation() 70 | ) 71 | (input_layernorm): RMSNorm() 72 | (post_attention_layernorm): RMSNorm() 73 | ) 74 | (4): LLaMADecoderLayer( 75 | (self_attn): LLaMAAttention( 76 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 77 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 78 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 79 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 80 | (rotary_emb): RotaryEmbedding() 81 | ) 82 | (mlp): LLaMAMLP( 83 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 84 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 85 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 86 | (act_fn): SiLUActivation() 87 | ) 88 | (input_layernorm): RMSNorm() 89 | (post_attention_layernorm): RMSNorm() 90 | ) 91 | (5): LLaMADecoderLayer( 92 | (self_attn): LLaMAAttention( 93 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 94 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 95 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 96 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 97 | (rotary_emb): RotaryEmbedding() 98 | ) 99 | (mlp): LLaMAMLP( 100 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 101 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 102 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 103 | (act_fn): SiLUActivation() 104 | ) 105 | (input_layernorm): RMSNorm() 106 | (post_attention_layernorm): RMSNorm() 107 | ) 108 | (6): LLaMADecoderLayer( 109 | (self_attn): LLaMAAttention( 110 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 111 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 112 | (v_proj): 
Linear(in_features=4096, out_features=4096, bias=False) 113 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 114 | (rotary_emb): RotaryEmbedding() 115 | ) 116 | (mlp): LLaMAMLP( 117 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 118 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 119 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 120 | (act_fn): SiLUActivation() 121 | ) 122 | (input_layernorm): RMSNorm() 123 | (post_attention_layernorm): RMSNorm() 124 | ) 125 | (7): LLaMADecoderLayer( 126 | (self_attn): LLaMAAttention( 127 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 128 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 129 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 130 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 131 | (rotary_emb): RotaryEmbedding() 132 | ) 133 | (mlp): LLaMAMLP( 134 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 135 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 136 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 137 | (act_fn): SiLUActivation() 138 | ) 139 | (input_layernorm): RMSNorm() 140 | (post_attention_layernorm): RMSNorm() 141 | ) 142 | (8): LLaMADecoderLayer( 143 | (self_attn): LLaMAAttention( 144 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 145 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 146 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 147 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 148 | (rotary_emb): RotaryEmbedding() 149 | ) 150 | (mlp): LLaMAMLP( 151 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 152 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 153 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 154 | (act_fn): SiLUActivation() 155 | ) 156 | (input_layernorm): RMSNorm() 157 | (post_attention_layernorm): RMSNorm() 158 | ) 159 | (9): LLaMADecoderLayer( 160 | (self_attn): LLaMAAttention( 161 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 162 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 163 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 164 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 165 | (rotary_emb): RotaryEmbedding() 166 | ) 167 | (mlp): LLaMAMLP( 168 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 169 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 170 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 171 | (act_fn): SiLUActivation() 172 | ) 173 | (input_layernorm): RMSNorm() 174 | (post_attention_layernorm): RMSNorm() 175 | ) 176 | (10): LLaMADecoderLayer( 177 | (self_attn): LLaMAAttention( 178 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 179 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 180 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 181 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 182 | (rotary_emb): RotaryEmbedding() 183 | ) 184 | (mlp): LLaMAMLP( 185 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 186 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 187 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 188 | (act_fn): 
SiLUActivation() 189 | ) 190 | (input_layernorm): RMSNorm() 191 | (post_attention_layernorm): RMSNorm() 192 | ) 193 | (11): LLaMADecoderLayer( 194 | (self_attn): LLaMAAttention( 195 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 196 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 197 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 198 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 199 | (rotary_emb): RotaryEmbedding() 200 | ) 201 | (mlp): LLaMAMLP( 202 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 203 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 204 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 205 | (act_fn): SiLUActivation() 206 | ) 207 | (input_layernorm): RMSNorm() 208 | (post_attention_layernorm): RMSNorm() 209 | ) 210 | (12): LLaMADecoderLayer( 211 | (self_attn): LLaMAAttention( 212 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 213 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 214 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 215 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 216 | (rotary_emb): RotaryEmbedding() 217 | ) 218 | (mlp): LLaMAMLP( 219 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 220 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 221 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 222 | (act_fn): SiLUActivation() 223 | ) 224 | (input_layernorm): RMSNorm() 225 | (post_attention_layernorm): RMSNorm() 226 | ) 227 | (13): LLaMADecoderLayer( 228 | (self_attn): LLaMAAttention( 229 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 230 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 231 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 232 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 233 | (rotary_emb): RotaryEmbedding() 234 | ) 235 | (mlp): LLaMAMLP( 236 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 237 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 238 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 239 | (act_fn): SiLUActivation() 240 | ) 241 | (input_layernorm): RMSNorm() 242 | (post_attention_layernorm): RMSNorm() 243 | ) 244 | (14): LLaMADecoderLayer( 245 | (self_attn): LLaMAAttention( 246 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 247 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 248 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 249 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 250 | (rotary_emb): RotaryEmbedding() 251 | ) 252 | (mlp): LLaMAMLP( 253 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 254 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 255 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 256 | (act_fn): SiLUActivation() 257 | ) 258 | (input_layernorm): RMSNorm() 259 | (post_attention_layernorm): RMSNorm() 260 | ) 261 | (15): LLaMADecoderLayer( 262 | (self_attn): LLaMAAttention( 263 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 264 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 265 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 266 | (o_proj): Linear(in_features=4096, 
out_features=4096, bias=False) 267 | (rotary_emb): RotaryEmbedding() 268 | ) 269 | (mlp): LLaMAMLP( 270 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 271 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 272 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 273 | (act_fn): SiLUActivation() 274 | ) 275 | (input_layernorm): RMSNorm() 276 | (post_attention_layernorm): RMSNorm() 277 | ) 278 | (16): LLaMADecoderLayer( 279 | (self_attn): LLaMAAttention( 280 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 281 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 282 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 283 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 284 | (rotary_emb): RotaryEmbedding() 285 | ) 286 | (mlp): LLaMAMLP( 287 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 288 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 289 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 290 | (act_fn): SiLUActivation() 291 | ) 292 | (input_layernorm): RMSNorm() 293 | (post_attention_layernorm): RMSNorm() 294 | ) 295 | (17): LLaMADecoderLayer( 296 | (self_attn): LLaMAAttention( 297 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 298 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 299 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 300 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 301 | (rotary_emb): RotaryEmbedding() 302 | ) 303 | (mlp): LLaMAMLP( 304 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 305 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 306 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 307 | (act_fn): SiLUActivation() 308 | ) 309 | (input_layernorm): RMSNorm() 310 | (post_attention_layernorm): RMSNorm() 311 | ) 312 | (18): LLaMADecoderLayer( 313 | (self_attn): LLaMAAttention( 314 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 315 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 316 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 317 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 318 | (rotary_emb): RotaryEmbedding() 319 | ) 320 | (mlp): LLaMAMLP( 321 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 322 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 323 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 324 | (act_fn): SiLUActivation() 325 | ) 326 | (input_layernorm): RMSNorm() 327 | (post_attention_layernorm): RMSNorm() 328 | ) 329 | (19): LLaMADecoderLayer( 330 | (self_attn): LLaMAAttention( 331 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 332 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 333 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 334 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 335 | (rotary_emb): RotaryEmbedding() 336 | ) 337 | (mlp): LLaMAMLP( 338 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 339 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 340 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 341 | (act_fn): SiLUActivation() 342 | ) 343 | (input_layernorm): RMSNorm() 344 | (post_attention_layernorm): RMSNorm() 
345 | ) 346 | (20): LLaMADecoderLayer( 347 | (self_attn): LLaMAAttention( 348 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 349 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 350 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 351 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 352 | (rotary_emb): RotaryEmbedding() 353 | ) 354 | (mlp): LLaMAMLP( 355 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 356 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 357 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 358 | (act_fn): SiLUActivation() 359 | ) 360 | (input_layernorm): RMSNorm() 361 | (post_attention_layernorm): RMSNorm() 362 | ) 363 | (21): LLaMADecoderLayer( 364 | (self_attn): LLaMAAttention( 365 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 366 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 367 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 368 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 369 | (rotary_emb): RotaryEmbedding() 370 | ) 371 | (mlp): LLaMAMLP( 372 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 373 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 374 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 375 | (act_fn): SiLUActivation() 376 | ) 377 | (input_layernorm): RMSNorm() 378 | (post_attention_layernorm): RMSNorm() 379 | ) 380 | (22): LLaMADecoderLayer( 381 | (self_attn): LLaMAAttention( 382 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 383 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 384 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 385 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 386 | (rotary_emb): RotaryEmbedding() 387 | ) 388 | (mlp): LLaMAMLP( 389 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 390 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 391 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 392 | (act_fn): SiLUActivation() 393 | ) 394 | (input_layernorm): RMSNorm() 395 | (post_attention_layernorm): RMSNorm() 396 | ) 397 | (23): LLaMADecoderLayer( 398 | (self_attn): LLaMAAttention( 399 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 400 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 401 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 402 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 403 | (rotary_emb): RotaryEmbedding() 404 | ) 405 | (mlp): LLaMAMLP( 406 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 407 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 408 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 409 | (act_fn): SiLUActivation() 410 | ) 411 | (input_layernorm): RMSNorm() 412 | (post_attention_layernorm): RMSNorm() 413 | ) 414 | (24): LLaMADecoderLayer( 415 | (self_attn): LLaMAAttention( 416 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 417 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 418 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 419 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 420 | (rotary_emb): RotaryEmbedding() 421 | ) 422 | (mlp): LLaMAMLP( 423 | 
(gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 424 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 425 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 426 | (act_fn): SiLUActivation() 427 | ) 428 | (input_layernorm): RMSNorm() 429 | (post_attention_layernorm): RMSNorm() 430 | ) 431 | (25): LLaMADecoderLayer( 432 | (self_attn): LLaMAAttention( 433 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 434 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 435 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 436 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 437 | (rotary_emb): RotaryEmbedding() 438 | ) 439 | (mlp): LLaMAMLP( 440 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 441 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 442 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 443 | (act_fn): SiLUActivation() 444 | ) 445 | (input_layernorm): RMSNorm() 446 | (post_attention_layernorm): RMSNorm() 447 | ) 448 | (26): LLaMADecoderLayer( 449 | (self_attn): LLaMAAttention( 450 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 451 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 452 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 453 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 454 | (rotary_emb): RotaryEmbedding() 455 | ) 456 | (mlp): LLaMAMLP( 457 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 458 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 459 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 460 | (act_fn): SiLUActivation() 461 | ) 462 | (input_layernorm): RMSNorm() 463 | (post_attention_layernorm): RMSNorm() 464 | ) 465 | (27): LLaMADecoderLayer( 466 | (self_attn): LLaMAAttention( 467 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 468 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 469 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 470 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 471 | (rotary_emb): RotaryEmbedding() 472 | ) 473 | (mlp): LLaMAMLP( 474 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 475 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 476 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 477 | (act_fn): SiLUActivation() 478 | ) 479 | (input_layernorm): RMSNorm() 480 | (post_attention_layernorm): RMSNorm() 481 | ) 482 | (28): LLaMADecoderLayer( 483 | (self_attn): LLaMAAttention( 484 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 485 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 486 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 487 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 488 | (rotary_emb): RotaryEmbedding() 489 | ) 490 | (mlp): LLaMAMLP( 491 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 492 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 493 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 494 | (act_fn): SiLUActivation() 495 | ) 496 | (input_layernorm): RMSNorm() 497 | (post_attention_layernorm): RMSNorm() 498 | ) 499 | (29): LLaMADecoderLayer( 500 | (self_attn): LLaMAAttention( 501 | (q_proj): 
Linear(in_features=4096, out_features=4096, bias=False) 502 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 503 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 504 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 505 | (rotary_emb): RotaryEmbedding() 506 | ) 507 | (mlp): LLaMAMLP( 508 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 509 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 510 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 511 | (act_fn): SiLUActivation() 512 | ) 513 | (input_layernorm): RMSNorm() 514 | (post_attention_layernorm): RMSNorm() 515 | ) 516 | (30): LLaMADecoderLayer( 517 | (self_attn): LLaMAAttention( 518 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 519 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 520 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 521 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 522 | (rotary_emb): RotaryEmbedding() 523 | ) 524 | (mlp): LLaMAMLP( 525 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 526 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 527 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 528 | (act_fn): SiLUActivation() 529 | ) 530 | (input_layernorm): RMSNorm() 531 | (post_attention_layernorm): RMSNorm() 532 | ) 533 | (31): LLaMADecoderLayer( 534 | (self_attn): LLaMAAttention( 535 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 536 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 537 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 538 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 539 | (rotary_emb): RotaryEmbedding() 540 | ) 541 | (mlp): LLaMAMLP( 542 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 543 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 544 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 545 | (act_fn): SiLUActivation() 546 | ) 547 | (input_layernorm): RMSNorm() 548 | (post_attention_layernorm): RMSNorm() 549 | ) 550 | ) 551 | (norm): RMSNorm() 552 | ) 553 | (lm_head): Linear(in_features=4096, out_features=32000, bias=False) 554 | ) 555 | ``` 556 | -------------------------------------------------------------------------------- /docs/llama_inference.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/henrywoo/pyllama/9dca874d11ca2dbb6ebe4ffe48424f05f20a57eb/docs/llama_inference.png -------------------------------------------------------------------------------- /docs/llama_multigpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/henrywoo/pyllama/9dca874d11ca2dbb6ebe4ffe48424f05f20a57eb/docs/llama_multigpu.png -------------------------------------------------------------------------------- /docs/llama_profiling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/henrywoo/pyllama/9dca874d11ca2dbb6ebe4ffe48424f05f20a57eb/docs/llama_profiling.png -------------------------------------------------------------------------------- /docs/llama_webui.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/henrywoo/pyllama/9dca874d11ca2dbb6ebe4ffe48424f05f20a57eb/docs/llama_webui.png -------------------------------------------------------------------------------- /docs/pyllama_7B_3GB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/henrywoo/pyllama/9dca874d11ca2dbb6ebe4ffe48424f05f20a57eb/docs/pyllama_7B_3GB.png -------------------------------------------------------------------------------- /docs/pyllama_7B_6GB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/henrywoo/pyllama/9dca874d11ca2dbb6ebe4ffe48424f05f20a57eb/docs/pyllama_7B_6GB.png -------------------------------------------------------------------------------- /download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright (c) Meta Platforms, Inc. and affiliates. 3 | # This software may be used and distributed according to the terms of the GNU General Public License version 3. 4 | 5 | PRESIGNED_URL="" # edit this with the presigned url 6 | MODEL_SIZE="7B,13B,30B,65B" # edit this list with the model sizes you wish to download 7 | TARGET_FOLDER="" # where all files should end up 8 | 9 | declare -A N_SHARD_DICT 10 | 11 | N_SHARD_DICT["7B"]="0" 12 | N_SHARD_DICT["13B"]="1" 13 | N_SHARD_DICT["30B"]="3" 14 | N_SHARD_DICT["65B"]="7" 15 | 16 | echo "Downloading tokenizer" 17 | if cd ${TARGET_FOLDER} && [[ ! -f tokenizer.model ]] && [[ ! -f tokenizer_checklist.chk ]] && ! md5sum -c tokenizer_checklist.chk; then 18 | wget ${PRESIGNED_URL/'*'/"tokenizer.model"} -O ${TARGET_FOLDER}"/tokenizer.model" 19 | wget ${PRESIGNED_URL/'*'/"tokenizer_checklist.chk"} -O ${TARGET_FOLDER}"/tokenizer_checklist.chk" 20 | (cd ${TARGET_FOLDER} && md5sum -c tokenizer_checklist.chk) 21 | else 22 | echo "Skipping downloading tokenizer, already exists and checksum matches" 23 | fi 24 | 25 | for i in ${MODEL_SIZE//,/ } 26 | do 27 | 28 | echo "Downloading ${i}" 29 | mkdir -p ${TARGET_FOLDER}"/${i}" 30 | 31 | file_name="${TARGET_FOLDER}/${i}/checklist.chk" 32 | echo "Downloading ${file_name}" 33 | if ! [[ -f "${file_name}" ]]; then 34 | wget ${PRESIGNED_URL/'*'/"${i}/checklist.chk"} -O ${TARGET_FOLDER}"/${i}/checklist.chk" 35 | else 36 | echo "Skipping downloading ${file_name}, already exists" 37 | fi 38 | for s in $(seq -f "0%g" 0 ${N_SHARD_DICT[$i]}) 39 | do 40 | echo $s 41 | file_name="consolidated.${s}.pth" 42 | echo $file_name 43 | checklist_file="${TARGET_FOLDER}/${i}/checklist.chk" 44 | echo "${checklist_file##*/}" 45 | checksum=$(grep "${file_name##*/}" "${checklist_file}" | cut -d' ' -f1) 46 | # echo $(cd ${TARGET_FOLDER}"/${i}" && md5sum 'consolidated.00.pth' | cut -d' ' -f1) 47 | 48 | if cd "${TARGET_FOLDER}/${i}" && ! [[ -f "${file_name}" ]] || ! [[ $(md5sum "${file_name}" | cut -d' ' -f1) == "${checksum}" ]]; then 49 | wget ${PRESIGNED_URL/'*'/"${i}/consolidated.${s}.pth"} -O ${TARGET_FOLDER}"/${i}/consolidated.${s}.pth" 50 | else 51 | echo "Skipping downloading ${file_name}, already exists and checksum matches" 52 | fi 53 | done 54 | file_name="params.json" 55 | if cd ${TARGET_FOLDER}/${i} && !
[[ -f "${file_name}" ]]; then 56 | wget ${PRESIGNED_URL/'*'/"${i}/params.json"} -O ${TARGET_FOLDER}"/${i}/params.json" 57 | else 58 | echo "Skipping downloading ${file_name}, already exists" 59 | fi 60 | (cd ${TARGET_FOLDER}"/${i}" && md5sum -c checklist.chk) 61 | done 62 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the GNU General Public License version 3. 3 | 4 | from typing import Tuple 5 | import os 6 | import sys 7 | import torch 8 | import fire 9 | import time 10 | import json 11 | 12 | from pathlib import Path 13 | 14 | from fairscale.nn.model_parallel.initialize import initialize_model_parallel 15 | 16 | from llama import Tokenizer, LLaMA 17 | from llama.model_parallel import ModelArgs, Transformer 18 | 19 | 20 | def setup_model_parallel() -> Tuple[int, int]: 21 | local_rank = int(os.environ.get("LOCAL_RANK", -1)) 22 | world_size = int(os.environ.get("WORLD_SIZE", -1)) 23 | 24 | torch.distributed.init_process_group("nccl") 25 | initialize_model_parallel(world_size) 26 | torch.cuda.set_device(local_rank) 27 | 28 | # seed must be the same in all processes 29 | torch.manual_seed(1) 30 | return local_rank, world_size 31 | 32 | 33 | def load(ckpt_dir: str, tokenizer_path: str, local_rank: int, world_size: int) -> LLaMA: 34 | start_time = time.time() 35 | checkpoints = sorted(Path(ckpt_dir).glob("*.pth")) 36 | assert world_size == len( 37 | checkpoints 38 | ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}" 39 | ckpt_path = checkpoints[local_rank] 40 | print("Loading") 41 | checkpoint = torch.load(ckpt_path, map_location="cpu") 42 | with open(Path(ckpt_dir) / "params.json", "r") as f: 43 | params = json.loads(f.read()) 44 | 45 | model_args: ModelArgs = ModelArgs(max_seq_len=1024, max_batch_size=32, **params) 46 | tokenizer = Tokenizer(model_path=tokenizer_path) 47 | model_args.vocab_size = tokenizer.n_words 48 | torch.set_default_tensor_type(torch.cuda.HalfTensor) 49 | model = Transformer(model_args) 50 | torch.set_default_tensor_type(torch.FloatTensor) 51 | model.load_state_dict(checkpoint, strict=False) 52 | 53 | generator = LLaMA(model, tokenizer) 54 | print(f"Loaded in {time.time() - start_time:.2f} seconds") 55 | return generator 56 | 57 | 58 | def main( 59 | ckpt_dir: str, tokenizer_path: str, temperature: float = 0.8, top_p: float = 0.95 60 | ): 61 | local_rank, world_size = setup_model_parallel() 62 | if local_rank > 0: 63 | sys.stdout = open(os.devnull, "w") 64 | 65 | generator = load(ckpt_dir, tokenizer_path, local_rank, world_size) 66 | prompts = [ 67 | "The capital of Germany is the city of", 68 | "Here is my sonnet in the style of Shakespeare about an artificial intelligence:", 69 | ] 70 | results = generator.generate( 71 | prompts, max_gen_len=256, temperature=temperature, top_p=top_p 72 | ) 73 | 74 | for result in results: 75 | print(result) 76 | print("\n==================================\n") 77 | 78 | 79 | if __name__ == "__main__": 80 | fire.Fire(main) 81 | -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import json 4 | from pathlib import Path 5 | from llama import ModelArgs, Transformer, Tokenizer, LLaMA 6 | 7 | 8 | def load( 
9 | ckpt_dir: str, 10 | tokenizer_path: str, 11 | local_rank: int, 12 | world_size: int, 13 | max_seq_len: int, 14 | max_batch_size: int, 15 | ) -> LLaMA: 16 | checkpoints = sorted(Path(ckpt_dir).glob("*.pth")) 17 | assert world_size == len( 18 | checkpoints 19 | ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}" 20 | ckpt_path = checkpoints[local_rank] 21 | 22 | checkpoint = torch.load(ckpt_path, map_location="cpu") 23 | 24 | with open(Path(ckpt_dir) / "params.json", "r") as f: 25 | params = json.loads(f.read()) 26 | 27 | model_args: ModelArgs = ModelArgs( 28 | max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params 29 | ) 30 | tokenizer = Tokenizer(model_path=tokenizer_path) 31 | model_args.vocab_size = tokenizer.n_words 32 | torch.set_default_tensor_type(torch.cuda.HalfTensor) 33 | model = Transformer(model_args) 34 | torch.set_default_tensor_type(torch.FloatTensor) 35 | model.load_state_dict(checkpoint, strict=False) 36 | generator = LLaMA(model, tokenizer) 37 | return generator 38 | 39 | 40 | def run( 41 | ckpt_dir: str, 42 | tokenizer_path: str, 43 | temperature: float = 0.8, 44 | top_p: float = 0.95, 45 | max_seq_len: int = 1024, 46 | max_batch_size: int = 1, 47 | ): 48 | local_rank = 0 49 | world_size = 1 50 | generator = load( 51 | ckpt_dir, tokenizer_path, local_rank, world_size, max_seq_len, max_batch_size 52 | ) 53 | prompts = [ 54 | # For these prompts, the expected answer is the natural continuation of the prompt 55 | "I believe the meaning of life is", # removed: keep only one prompt 56 | ] 57 | while True: 58 | print("Prompt:", prompts) 59 | results = generator.generate( 60 | prompts, max_gen_len=256, temperature=temperature, top_p=top_p 61 | ) 62 | for result in results: 63 | print("🦙LLaMA:", result.strip()) 64 | 65 | user_input = input("please enter your prompts (Ctrl+C to exit): ") 66 | prompts = [user_input] 67 | 68 | 69 | def get_args(): 70 | import argparse 71 | 72 | parser = argparse.ArgumentParser() 73 | parser.add_argument("--ckpt_dir", type=str, default="/llama_data/7B") 74 | parser.add_argument( 75 | "--tokenizer_path", type=str, default="/llama_data/tokenizer.model" 76 | ) 77 | return parser.parse_args() 78 | 79 | 80 | if __name__ == "__main__": 81 | args = get_args() 82 | run( 83 | ckpt_dir=args.ckpt_dir, 84 | tokenizer_path=args.tokenizer_path, 85 | temperature=0.8, 86 | top_p=0.95, 87 | max_seq_len=1024, 88 | max_batch_size=1, 89 | ) 90 | -------------------------------------------------------------------------------- /inference_driver.py: -------------------------------------------------------------------------------- 1 | import hiq, time 2 | from hiq.memory import total_gpu_memory_mb, get_memory_mb 3 | 4 | 5 | def run_main(): 6 | driver = hiq.HiQLatency( 7 | hiq_table_or_path=[ 8 | ["inference", "", "load", "load_llama"], 9 | ["llama.generation", "LLaMA", "generate", "generate"], 10 | # ["llama.model_single", "Transformer", "forward", "forward"], 11 | ], 12 | metric_funcs=[time.time, total_gpu_memory_mb, get_memory_mb], 13 | # extra_metrics={hiq.ExtraMetrics.ARGS}, 14 | ) 15 | 16 | args = hiq.mod("inference").get_args() 17 | hiq.mod("inference").run(args.ckpt_dir, args.tokenizer_path) 18 | print("*" * 30, "GPU/CPU/Latency Profiling", "*" * 30) 19 | driver.show() 20 | 21 | 22 | if __name__ == "__main__": 23 | run_main() 24 | -------------------------------------------------------------------------------- /llama/__init__.py: -------------------------------------------------------------------------------- 1 | from 
.generation import LLaMA 2 | 3 | 4 | def pyllama_env(x, default=None) -> bool: 5 | import os, ast 6 | t = os.environ.get(x, default) 7 | if isinstance(t, str) and t: 8 | try: 9 | return bool(ast.literal_eval(t)) 10 | except: 11 | return True 12 | return bool(t) 13 | 14 | if pyllama_env("PYLLAMA_META_MP"): 15 | from .model_parallel import ModelArgs, Transformer 16 | else: 17 | from .model_single import ModelArgs, Transformer 18 | from .tokenizer import Tokenizer 19 | 20 | __version__ = "0.0.2" 21 | -------------------------------------------------------------------------------- /llama/convert_llama.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import argparse 15 | import json 16 | import os 17 | import shutil 18 | 19 | import torch 20 | import hiq 21 | 22 | """ 23 | Sample usage: 24 | 25 | ``` 26 | python -m llama.convert_llama --ckpt_dir $CKPT_DIR --tokenizer_path $TOKENIZER_PATH \ 27 | --model 7B --output_dir converted_meta --to fb --max_batch_size 4 28 | ``` 29 | 30 | Thereafter, models can be loaded via: 31 | 32 | ``` 33 | tokenizer = llama.hf.LLaMATokenizer.from_pretrained("/output/path/tokenizer/") 34 | model = llama.hf.LLaMAForCausalLM.from_pretrained("/output/path/llama-7b/") 35 | ``` 36 | """ 37 | 38 | INTERMEDIATE_SIZE_MAP = { 39 | "7B": 11008, 40 | "13B": 13824, 41 | "30B": 17920, 42 | "65B": 22016, 43 | } 44 | NUM_SHARDS = { 45 | "7B": 1, 46 | "13B": 2, 47 | "30B": 4, 48 | "65B": 8, 49 | } 50 | META_KEY_TO_DIM = {"w1": 0, "w2": -1, "w3": 0, "wo": -1, "wq": 0, "wk": 0, "wv": 0, "output": 0, "tok_embeddings": -1, 51 | "ffn_norm": None, "attention_norm": None, "norm": None, "rope": None} 52 | 53 | def write_json(text, path): 54 | with open(path, "w") as f: 55 | json.dump(text, f) 56 | 57 | 58 | def write_model(model_path, input_base_path, model_size): 59 | assert model_size in INTERMEDIATE_SIZE_MAP 60 | os.makedirs(model_path, exist_ok=True) 61 | 62 | params = hiq.read_file(os.path.join(input_base_path, "params.json"), as_json=True) 63 | num_shards = NUM_SHARDS[model_size] 64 | n_layers = params["n_layers"] 65 | n_heads = params["n_heads"] 66 | n_heads_per_shard = n_heads // num_shards 67 | dim = params["dim"] 68 | dims_per_head = dim // n_heads 69 | base = 10000.0 70 | inv_freq = 1.0 / ( 71 | base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head) 72 | ) 73 | 74 | # permute for sliced rotary 75 | def permute(w): 76 | return ( 77 | w.view(n_heads, dim // n_heads // 2, 2, dim) 78 | .transpose(1, 2) 79 | .reshape(dim, dim) 80 | ) 81 | 82 | # Load weights 83 | if model_size == "7B": 84 | # Not shared 85 | # (The sharded implementation would also work, but this is simpler.) 
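# The 7B release ships a single shard (consolidated.00.pth), so it is loaded here as one state dict; the larger models are split across NUM_SHARDS files and are loaded in the else branch below as a list of per-shard state dicts that get concatenated layer by layer.
# As the "permute for sliced rotary" note above indicates, permute() reorders the wq/wk rows so that the converted q_proj/k_proj weights match the rotary-embedding layout expected by the Hugging Face LLaMA attention implementation.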
86 | loaded = torch.load( 87 | os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu" 88 | ) 89 | else: 90 | # Sharded 91 | loaded = [ 92 | torch.load( 93 | os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), 94 | map_location="cpu", 95 | ) 96 | for i in range(num_shards) 97 | ] 98 | param_count = 0 99 | index_dict = {"weight_map": {}} 100 | for layer_i in range(n_layers): 101 | filename = "pytorch_model-{:05d}-of-{:05d}.bin".format( 102 | layer_i + 1, 103 | n_layers + 1, 104 | ) 105 | if model_size == "7B": 106 | # Unsharded 107 | state_dict = { 108 | f"model.layers.{layer_i}.self_attn.q_proj.weight": permute( 109 | loaded[f"layers.{layer_i}.attention.wq.weight"] 110 | ), 111 | f"model.layers.{layer_i}.self_attn.k_proj.weight": permute( 112 | loaded[f"layers.{layer_i}.attention.wk.weight"] 113 | ), 114 | f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[ 115 | f"layers.{layer_i}.attention.wv.weight" 116 | ], 117 | f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[ 118 | f"layers.{layer_i}.attention.wo.weight" 119 | ], 120 | f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[ 121 | f"layers.{layer_i}.feed_forward.w1.weight" 122 | ], 123 | f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[ 124 | f"layers.{layer_i}.feed_forward.w2.weight" 125 | ], 126 | f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[ 127 | f"layers.{layer_i}.feed_forward.w3.weight" 128 | ], 129 | f"model.layers.{layer_i}.input_layernorm.weight": loaded[ 130 | f"layers.{layer_i}.attention_norm.weight" 131 | ], 132 | f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[ 133 | f"layers.{layer_i}.ffn_norm.weight" 134 | ], 135 | } 136 | else: 137 | # Sharded 138 | state_dict = { 139 | f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][ 140 | f"layers.{layer_i}.attention_norm.weight" 141 | ], 142 | f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][ 143 | f"layers.{layer_i}.ffn_norm.weight" 144 | ], 145 | } 146 | state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute( 147 | torch.cat( 148 | [ 149 | loaded[i][f"layers.{layer_i}.attention.wq.weight"].view( 150 | n_heads_per_shard, dims_per_head, dim 151 | ) 152 | for i in range(num_shards) 153 | ], 154 | dim=0, 155 | ).reshape(dim, dim) 156 | ) 157 | state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute( 158 | torch.cat( 159 | [ 160 | loaded[i][f"layers.{layer_i}.attention.wk.weight"].view( 161 | n_heads_per_shard, dims_per_head, dim 162 | ) 163 | for i in range(num_shards) 164 | ], 165 | dim=0, 166 | ).reshape(dim, dim) 167 | ) 168 | state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( 169 | [ 170 | loaded[i][f"layers.{layer_i}.attention.wv.weight"].view( 171 | n_heads_per_shard, dims_per_head, dim 172 | ) 173 | for i in range(num_shards) 174 | ], 175 | dim=0, 176 | ).reshape(dim, dim) 177 | 178 | state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( 179 | [ 180 | loaded[i][f"layers.{layer_i}.attention.wo.weight"] 181 | for i in range(num_shards) 182 | ], 183 | dim=1, 184 | ) 185 | state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( 186 | [ 187 | loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] 188 | for i in range(num_shards) 189 | ], 190 | dim=0, 191 | ) 192 | state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( 193 | [ 194 | loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] 195 | for i in range(num_shards) 196 | ], 197 | dim=1, 198 | ) 199 | 
state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( 200 | [ 201 | loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] 202 | for i in range(num_shards) 203 | ], 204 | dim=0, 205 | ) 206 | 207 | state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq 208 | for k, v in state_dict.items(): 209 | index_dict["weight_map"][k] = filename 210 | param_count += v.numel() 211 | torch.save(state_dict, os.path.join(model_path, filename)) 212 | 213 | filename = "pytorch_model-{:05d}-of-{:05d}.bin".format( 214 | n_layers + 1, 215 | n_layers + 1, 216 | ) 217 | if model_size == "7B": 218 | # Unsharded 219 | state_dict = { 220 | "model.embed_tokens.weight": loaded["tok_embeddings.weight"], 221 | "model.norm.weight": loaded["norm.weight"], 222 | "lm_head.weight": loaded["output.weight"], 223 | } 224 | else: 225 | state_dict = { 226 | "model.norm.weight": loaded[0]["norm.weight"], 227 | "model.embed_tokens.weight": torch.cat( 228 | [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1 229 | ), 230 | "lm_head.weight": torch.cat( 231 | [loaded[i]["output.weight"] for i in range(num_shards)], dim=0 232 | ), 233 | } 234 | 235 | for k, v in state_dict.items(): 236 | index_dict["weight_map"][k] = filename 237 | param_count += v.numel() 238 | torch.save(state_dict, os.path.join(model_path, filename)) 239 | 240 | # Write configs 241 | index_dict["metadata"] = {"total_size": param_count * 2} 242 | write_json(index_dict, os.path.join(model_path, "pytorch_model.bin.index.json")) 243 | config_out = { 244 | "architectures": ["LLaMAForCausalLM"], 245 | "bos_token_id": 0, 246 | "eos_token_id": 1, 247 | "hidden_act": "silu", 248 | "hidden_size": params["dim"], 249 | "intermediate_size": INTERMEDIATE_SIZE_MAP[model_size], 250 | "initializer_range": 0.02, 251 | "max_sequence_length": 2048, 252 | "model_type": "llama", 253 | "num_attention_heads": params["n_heads"], 254 | "num_hidden_layers": params["n_layers"], 255 | "pad_token_id": -1, 256 | "rms_norm_eps": params["norm_eps"], 257 | "torch_dtype": "float16", 258 | "transformers_version": "4.27.0.dev0", 259 | "use_cache": True, 260 | "vocab_size": 32000, 261 | } 262 | write_json( 263 | config_out, 264 | os.path.join(model_path, "config.json"), 265 | ) 266 | generation_config = { 267 | "_from_model_config": True, 268 | "bos_token_id": 0, 269 | "eos_token_id": 1, 270 | "pad_token_id": 0, 271 | "transformers_version": "4.27.0.dev0", 272 | } 273 | write_json( 274 | generation_config, 275 | os.path.join(model_path, "generation_config.json"), 276 | ) 277 | 278 | 279 | def write_tokenizer(tokenizer_path, input_tokenizer_path): 280 | os.makedirs(tokenizer_path, exist_ok=True) 281 | write_json({}, os.path.join(tokenizer_path, "special_tokens_map.json")) 282 | write_json( 283 | { 284 | "bos_token": "", 285 | "eos_token": "", 286 | "model_max_length": int(1e30), 287 | "tokenizer_class": "LLaMATokenizer", 288 | "unk_token": "", 289 | }, 290 | os.path.join(tokenizer_path, "tokenizer_config.json"), 291 | ) 292 | shutil.copyfile( 293 | input_tokenizer_path, os.path.join(tokenizer_path, "tokenizer.model") 294 | ) 295 | 296 | 297 | def convert_llama_fb(args): 298 | from pathlib import Path 299 | from tqdm import tqdm 300 | from llama import ModelArgs, Tokenizer, Transformer 301 | output_dir = os.path.join(args.output_dir, args.model_size) 302 | os.makedirs(output_dir, exist_ok=True) 303 | 304 | if "tokenizer.model" not in os.listdir(output_dir): 305 | shutil.copy(args.tokenizer_path, args.output_dir) 306 | 307 | tokenizer_path = 
os.path.join(args.output_dir, "tokenizer.model") 308 | 309 | cks = sorted(Path(args.ckpt_dir).glob("*.pth")) 310 | params = hiq.read_file(Path(args.ckpt_dir) / "params.json",as_json=True) 311 | model_args = ModelArgs(max_seq_len=2048, max_batch_size=args.max_batch_size, **params) 312 | tokenizer = Tokenizer(model_path=tokenizer_path) 313 | model_args.vocab_size = tokenizer.n_words 314 | 315 | torch.set_default_tensor_type(torch.HalfTensor) 316 | print(f"⌛️ Loading model...Thank you for your patience...") 317 | model = Transformer(model_args) 318 | torch.set_default_tensor_type(torch.FloatTensor) 319 | dt = {} 320 | print(f"⌛️ Converting model...Thank you for your patience...") 321 | for i, ckpt in tqdm(enumerate(cks), total=len(cks)): 322 | ck = torch.load(ckpt, map_location="cpu") 323 | for nm, pm in model.named_parameters(): 324 | if nm not in dt: 325 | dt[nm] = torch.zeros_like(pm, device="cpu") 326 | short_name = nm.split(".")[-2] 327 | if META_KEY_TO_DIM[short_name] is None and i == 0: 328 | dt[nm] = ck[nm] 329 | elif META_KEY_TO_DIM[short_name] == 0: 330 | size = ck[nm].size(0) 331 | dt[nm][size * i: size * (i + 1), :] = ck[nm] 332 | elif META_KEY_TO_DIM[short_name] == -1: 333 | size = ck[nm].size(-1) 334 | dt[nm][:, size * i: size * (i + 1)] = ck[nm] 335 | hiq.write_file(os.path.join(output_dir, "params.json"), json.dumps(params, indent=4)) 336 | torch.save(dt, os.path.join(output_dir, "state_dict.pt")) 337 | 338 | 339 | def convert_llama_hf(args): 340 | write_model( 341 | model_path=os.path.join( 342 | args.output_dir, "llama-{}".format(args.model_size).lower() 343 | ), 344 | input_base_path=args.ckpt_dir, 345 | model_size=args.model_size, 346 | ) 347 | write_tokenizer( 348 | tokenizer_path=os.path.join(args.output_dir, "tokenizer"), 349 | input_tokenizer_path=args.tokenizer_path, 350 | ) 351 | 352 | def get_args(): 353 | parser = argparse.ArgumentParser() 354 | parser.add_argument("--ckpt_dir", type=str, default="/llama_data/7B") 355 | parser.add_argument( 356 | "--tokenizer_path", type=str, default="/llama_data/tokenizer.model" 357 | ) 358 | parser.add_argument( 359 | "--model_size", 360 | choices=NUM_SHARDS.keys(), 361 | ) 362 | parser.add_argument( 363 | "--output_dir", 364 | help="Location to write HF model and tokenizer", 365 | ) 366 | parser.add_argument( 367 | "--max_batch_size", type=int, default=2 368 | ) 369 | parser.add_argument("--to", choices={"fb", "hf"}) 370 | return parser.parse_args() 371 | 372 | 373 | 374 | if __name__ == "__main__": 375 | args = get_args() 376 | if args.to == "hf": 377 | convert_llama_hf(args) 378 | elif args.to == "fb": 379 | convert_llama_fb(args) 380 | else: 381 | print(f"wrong argument: {args.to}") 382 | -------------------------------------------------------------------------------- /llama/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | from threading import Thread 3 | 4 | 5 | here = os.path.dirname(os.path.realpath(__file__)) 6 | 7 | 8 | def download(args=None): 9 | import hiq 10 | 11 | cmd = f"bash {here}/download_community.sh" 12 | if args is not None: 13 | if args.model_size: 14 | cmd += f" {args.model_size}" 15 | if args.folder: 16 | cmd += f" {args.folder}" 17 | retcode = hiq.execute_cmd(cmd, verbose=False, shell=True, runtime_output=True, env=os.environ) 18 | if retcode != 0: 19 | # retry 20 | download(args) 21 | 22 | 23 | def download_watchdog(args): 24 | def watch(): 25 | import time 26 | 27 | # every 120s, check total file size under folder to see if it increases as 
the download speed suggests. if not, restart download 28 | folder = args.folder if args.folder else "pyllama_data" 29 | last_total_size = -1 30 | while True: 31 | total_size = 0 32 | for dirpath, _, filenames in os.walk(folder): 33 | for f in filenames: 34 | fp = os.path.join(dirpath, f) 35 | total_size += os.path.getsize(fp) 36 | size_changed_mb = (total_size - last_total_size) / 1024 / 1024 37 | if last_total_size != -1 and size_changed_mb < 120 * args.download_speed_mb: 38 | print( 39 | f"Download watchdog: total file size {total_size / 1024 / 1024:.2f}MB increased too slowly ({size_changed_mb:.2f}MB in the last 120s), restarting download" 40 | ) 41 | import hiq 42 | 43 | cmd = f"bash {here}/download_community_stop.sh" 44 | hiq.execute_cmd(cmd, verbose=False, shell=True, runtime_output=True) 45 | else: 46 | if last_total_size != -1: 47 | print( 48 | f"Download watchdog: total file size increased normally at speed {size_changed_mb / 120:.2f}MB/s" 49 | ) 50 | last_total_size = total_size 51 | time.sleep(120) 52 | 53 | watch_thread = Thread(target=watch, daemon=True) 54 | watch_thread.start() 55 | 56 | 57 | def get_args(): 58 | import argparse 59 | 60 | parser = argparse.ArgumentParser() 61 | 62 | parser.add_argument( 63 | "--model_size", 64 | type=str, 65 | default="7B,13B,30B,65B", 66 | help='The size of the models that you want to download. A comma-separated string of any of "7B", "13B", "30B", "65B". In total, 219 GB of disk space is needed to download them all. If you only want to download the 7B model, just put "7B" here.', 67 | ) 68 | parser.add_argument( 69 | "--folder", 70 | type=str, 71 | default="pyllama_data", 72 | help="The target folder for the downloaded files", 73 | ) 74 | parser.add_argument( 75 | "--download_speed_mb", 76 | type=int, 77 | default=1, 78 | help="The accepted download speed in MB/s. If the download speed is lower than this, the download will be restarted.", 79 | ) 80 | args = parser.parse_args() 81 | return args 82 | 83 | 84 | if __name__ == "__main__": 85 | args = get_args() 86 | download_watchdog(args) 87 | download(args) 88 | -------------------------------------------------------------------------------- /llama/download_community.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | PRESIGNED_URL="https://agi.gpt4.org/llama/LLaMA" 5 | ALL_MODELS=7B,13B,30B,65B 6 | 7 | YELLOW=$(tput setaf 3) 8 | RED=$(tput setaf 1) 9 | CLEAR=$(tput sgr0) 10 | 11 | function usage { 12 | cat <<EOF 13 | USAGE: ./download_community.sh [<models>] [<target_folder>] 14 | 15 | Download the given llama <models> to <target_folder>. By default, will 16 | download all available models into the current directory 17 | 18 | OPTIONS 19 | 20 | -v, --verbose: enable verbose mode 21 | -h, --help: print this help and exit 22 | 23 | EXAMPLES 24 | 25 | Download all models ($ALL_MODELS) into the current directory 26 | 27 | ./download_community.sh 28 | 29 | Download the 7B and 13B parameter models to /usr/share/llama 30 | 31 | ./download_community.sh 7B,13B /usr/share/llama 32 | 33 | EOF 34 | exit 1 35 | } 36 | 37 | # print its argument in red and quit 38 | function die { 39 | printf "%s%s%s\n" "$RED" "$1" "$CLEAR" 40 | exit 1 41 | } 42 | 43 | # print its argument in yellow 44 | function log { 45 | printf "\n%s%s%s\n" "$YELLOW" "$1" "$CLEAR" 46 | } 47 | 48 | # download a file with a progress bar, then display a success message. Takes 49 | # two arguments: the URL and the output file name 50 | function download { 51 | if !
wget --continue --progress=bar:force "$1" -O "$2"; then 52 | die "failed to download $1 -> $2" 53 | fi 54 | echo ✅ "$2" 55 | } 56 | 57 | # change into the model directory and use md5sum -c to verify the checksums of 58 | # the model files within. Uses a subshell to avoid changing the script's 59 | # direcotry 60 | function verify { 61 | (cd "$1" && md5sum -c "$2") 62 | } 63 | 64 | # return the number of shards for a given model. Bash 3 doesn't support 65 | # associative arrays, so use a case statement instead. 66 | function nshards { 67 | case $1 in 68 | 7B) 69 | echo 0 70 | ;; 71 | 13B) 72 | echo 1 73 | ;; 74 | 30B) 75 | echo 3 76 | ;; 77 | 65B) 78 | echo 7 79 | ;; 80 | *) 81 | die "invalid argument to nshards: $1" 82 | ;; 83 | esac 84 | 85 | } 86 | 87 | # check for wget - if it's not present print an error 88 | if ! command -v wget &> /dev/null 89 | then 90 | die "wget not found. You must have wget installed and on your path to run this script" 91 | fi 92 | 93 | # parse the optional flags and discard them 94 | while true; do 95 | case $1 in 96 | -v|--verbose) 97 | set -x 98 | shift 99 | ;; 100 | -h|--help|help) 101 | usage 102 | ;; 103 | *) 104 | break 105 | ;; 106 | esac 107 | done 108 | 109 | # MODELS_TO_DOWNLOAD is a comma-separated list of models the user wants to 110 | # download, which defaults to all models. Split it into an array called MODELS 111 | MODELS_TO_DOWNLOAD=${1:-$ALL_MODELS} 112 | IFS="," read -r -a MODELS <<< "$MODELS_TO_DOWNLOAD" 113 | 114 | # TARGET_FOLDER is the root directory to download the models to 115 | TARGET_FOLDER=${2:-.} 116 | 117 | log "❤️ Resume download is supported. You can ctrl-c and rerun the program to resume the downloading" 118 | 119 | # ensure the targeted directory exists 120 | mkdir -p "$TARGET_FOLDER" 121 | 122 | log "Downloading tokenizer..." 123 | download "$PRESIGNED_URL/tokenizer.model" "$TARGET_FOLDER/tokenizer.model" 124 | download "$PRESIGNED_URL/tokenizer_checklist.chk" "$TARGET_FOLDER/tokenizer_checklist.chk" 125 | verify "$TARGET_FOLDER" tokenizer_checklist.chk 126 | 127 | # for each model, download each of its shards and then verify the checksums 128 | for model in "${MODELS[@]}" 129 | do 130 | log "Downloading $model" 131 | mkdir -p "$TARGET_FOLDER/$model" 132 | 133 | # download each shard in the model 134 | for s in $(seq -f "0%g" 0 "$(nshards "$model")") 135 | do 136 | fout="$TARGET_FOLDER/$model/consolidated.$s.pth" 137 | log "downloading file to $fout ...please wait for a few minutes ..." 
138 | download "$PRESIGNED_URL/$model/consolidated.$s.pth" "$fout" 139 | done 140 | 141 | # download the params and checksums 142 | download "$PRESIGNED_URL/$model/params.json" "$TARGET_FOLDER/$model/params.json" 143 | download "$PRESIGNED_URL/$model/checklist.chk" "$TARGET_FOLDER/$model/checklist.chk" 144 | 145 | log "Checking checksums for the $model model" 146 | verify "$TARGET_FOLDER/$model" checklist.chk 147 | done 148 | -------------------------------------------------------------------------------- /llama/download_community_stop.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ps aux | grep 'wget --continue --progress=bar:force https://agi.gpt4.org/llama/LLaMA/' | grep -v grep | awk '{print $2}' | xargs kill 3 | ps aux | grep '.*llama/download_community.sh' | grep -v grep | awk '{print $2}' | xargs kill 4 | -------------------------------------------------------------------------------- /llama/generation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the GNU General Public License version 3. 3 | 4 | from typing import List 5 | 6 | import torch 7 | 8 | from llama.tokenizer import Tokenizer 9 | 10 | 11 | class LLaMA: 12 | def __init__(self, model, tokenizer: Tokenizer): 13 | self.model = model 14 | self.tokenizer = tokenizer 15 | 16 | def _should_stop(self, tokens, prompt_tokens, stop_ids, stop_words): 17 | """credits go to: https://github.com/galatolofederico/vanilla-llama""" 18 | if stop_ids is not None: 19 | do_stop = [False for _ in range(len(tokens))] 20 | for i, (t, p) in enumerate(zip(tokens, prompt_tokens)): 21 | g = t[len(p):].tolist() 22 | for stop_id in stop_ids: 23 | if stop_id in g: 24 | do_stop[i] = True 25 | 26 | if all(do_stop): 27 | return True 28 | 29 | if stop_words is not None: 30 | do_stop = [False for _ in range(len(tokens))] 31 | for i, (t, p) in enumerate(zip(tokens, prompt_tokens)): 32 | t = t.clone() 33 | g = t[len(p):] 34 | g[g == self.tokenizer.pad_id] = self.tokenizer.eos_id 35 | g = g.tolist() 36 | d = self.tokenizer.decode(g) 37 | for stop_word in stop_words: 38 | if stop_word in d: 39 | do_stop[i] = True 40 | 41 | if all(do_stop): 42 | return True 43 | 44 | return False 45 | 46 | def generate( 47 | self, 48 | prompts: List[str], 49 | max_gen_len: int, 50 | temperature: float = 0.8, 51 | top_p: float = 0.95, 52 | stop_ids: List[int] = None, 53 | stop_words: List[str] = None, 54 | ) -> List[str]: 55 | bsz = len(prompts) 56 | params = self.model.params 57 | assert bsz <= params.max_batch_size, (bsz, params.max_batch_size) 58 | 59 | prompt_tokens = [self.tokenizer.encode(x, bos=True, eos=False) for x in prompts] 60 | 61 | min_prompt_size = min([len(t) for t in prompt_tokens]) 62 | max_prompt_size = max([len(t) for t in prompt_tokens]) 63 | 64 | total_len = min(params.max_seq_len, max_gen_len + max_prompt_size) 65 | 66 | tokens = torch.full((bsz, total_len), self.tokenizer.pad_id).cuda().long() 67 | for k, t in enumerate(prompt_tokens): 68 | tokens[k, : len(t)] = torch.tensor(t).long() 69 | input_text_mask = tokens != self.tokenizer.pad_id 70 | start_pos = min_prompt_size 71 | prev_pos = 0 72 | for cur_pos in range(start_pos, total_len): 73 | i = tokens[:, prev_pos:cur_pos] 74 | logits = self.model(i, prev_pos) 75 | if temperature > 0: 76 | probs = torch.softmax(logits / temperature, dim=-1) 77 | next_token = sample_top_p(probs, 
top_p) 78 | else: 79 | next_token = torch.argmax(logits, dim=-1) 80 | next_token = next_token.reshape(-1) 81 | # only replace token if prompt has already been generated 82 | next_token = torch.where( 83 | input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token 84 | ) 85 | tokens[:, cur_pos] = next_token 86 | prev_pos = cur_pos 87 | 88 | if self._should_stop(tokens, prompt_tokens, stop_ids, stop_words): 89 | break 90 | 91 | tokens[tokens == self.tokenizer.pad_id] = self.tokenizer.eos_id 92 | decoded = [] 93 | for i, t in enumerate(tokens.tolist()): 94 | # cut to max gen len 95 | t = t[: len(prompt_tokens[i]) + max_gen_len] 96 | # cut to eos tok if any 97 | try: 98 | t = t[: t.index(self.tokenizer.eos_id)] 99 | except ValueError: 100 | pass 101 | decoded.append(self.tokenizer.decode(t)) 102 | #print(decoded) 103 | return [postprocessing(i, stop_words) for i in decoded] 104 | 105 | 106 | def postprocessing(output_text, stop_words=None, threshold=10): 107 | sentences = output_text.split(".") 108 | filtered_sentences = [] 109 | for sentence in sentences: 110 | sentence = sentence.strip() 111 | if len(sentence) > threshold and sentence[-1] == ".": 112 | filtered_sentences.append(sentence) 113 | r = '.'.join(sentences).strip() 114 | if stop_words: 115 | for w in stop_words: 116 | if r.endswith(w): 117 | r = r[0:-len(w)].strip() 118 | if r[-1] != '.': 119 | r += '...' 120 | return r 121 | 122 | 123 | def sample_top_p(probs, p): 124 | probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True) 125 | probs_sum = torch.cumsum(probs_sort, dim=-1) 126 | mask = probs_sum - probs_sort > p 127 | probs_sort[mask] = 0.0 128 | probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True)) 129 | next_token = torch.multinomial(probs_sort, num_samples=1) 130 | next_token = torch.gather(probs_idx, -1, next_token) 131 | return next_token 132 | -------------------------------------------------------------------------------- /llama/hf/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
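# The lazy-import machinery below mirrors the pattern used by transformers itself:
# _import_structure lists the public symbols of each submodule, the sentencepiece/torch
# availability checks decide which entries get registered, and sys.modules[__name__] is
# swapped for a _LazyModule so the heavy submodules are only imported on first attribute
# access, e.g. `from llama.hf import LLaMATokenizer, LLaMAForCausalLM`.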
14 | from typing import TYPE_CHECKING 15 | 16 | from transformers.utils import ( 17 | OptionalDependencyNotAvailable, 18 | _LazyModule, 19 | is_torch_available, 20 | is_sentencepiece_available, 21 | ) 22 | 23 | 24 | _import_structure = { 25 | "configuration_llama": ["LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP", "LLaMAConfig"], 26 | } 27 | 28 | try: 29 | if not is_sentencepiece_available(): 30 | raise OptionalDependencyNotAvailable() 31 | except OptionalDependencyNotAvailable: 32 | pass 33 | else: 34 | _import_structure["tokenization_llama"] = ["LLaMATokenizer"] 35 | 36 | try: 37 | if not is_torch_available(): 38 | raise OptionalDependencyNotAvailable() 39 | except OptionalDependencyNotAvailable: 40 | pass 41 | else: 42 | _import_structure["modeling_llama"] = [ 43 | "LLaMAForCausalLM", 44 | "LLaMAModel", 45 | "LLaMAPreTrainedModel", 46 | ] 47 | 48 | 49 | if TYPE_CHECKING: 50 | from .configuration_llama import LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP, LLaMAConfig 51 | 52 | try: 53 | if not is_sentencepiece_available(): 54 | raise OptionalDependencyNotAvailable() 55 | except OptionalDependencyNotAvailable: 56 | pass 57 | else: 58 | from .tokenization_llama import LLaMATokenizer 59 | 60 | try: 61 | if not is_torch_available(): 62 | raise OptionalDependencyNotAvailable() 63 | except OptionalDependencyNotAvailable: 64 | pass 65 | else: 66 | from .modeling_llama import ( 67 | LLaMAForCausalLM, 68 | LLaMAModel, 69 | LLaMAPreTrainedModel, 70 | ) 71 | 72 | 73 | else: 74 | import sys 75 | 76 | sys.modules[__name__] = _LazyModule( 77 | __name__, globals()["__file__"], _import_structure, module_spec=__spec__ 78 | ) 79 | -------------------------------------------------------------------------------- /llama/hf/configuration_llama.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 3 | # 4 | # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX 5 | # and OPT implementations in this library. It has been modified from its 6 | # original forms to accommodate minor architectural differences compared 7 | # to GPT-NeoX and OPT used by the Meta AI team that trained the model. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | """ LLaMA model configuration""" 21 | 22 | from transformers.configuration_utils import PretrainedConfig 23 | from transformers.utils import logging 24 | 25 | 26 | logger = logging.get_logger(__name__) 27 | 28 | LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} 29 | 30 | 31 | class LLaMAConfig(PretrainedConfig): 32 | r""" 33 | This is the configuration class to store the configuration of a [`~LLaMAModel`]. It is used to instantiate an LLaMA 34 | model according to the specified arguments, defining the model architecture. Instantiating a configuration with the 35 | defaults will yield a similar configuration to that of the LLaMA-7B. 
36 | 37 | Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the 38 | documentation from [`PretrainedConfig`] for more information. 39 | 40 | 41 | Args: 42 | vocab_size (`int`, *optional*, defaults to 32000): 43 | Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the 44 | `inputs_ids` passed when calling [`~LLaMAModel`] or [`~TFLLaMAModel`]. 45 | hidden_size (`int`, *optional*, defaults to 4096): 46 | Dimension of the hidden representations. 47 | intermediate_size (`int`, *optional*, defaults to 11008): 48 | Dimension of the MLP representations. 49 | num_hidden_layers (`int`, *optional*, defaults to 32): 50 | Number of hidden layers in the Transformer encoder. 51 | num_attention_heads (`int`, *optional*, defaults to 32): 52 | Number of attention heads for each attention layer in the Transformer encoder. 53 | hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): 54 | The non-linear activation function (function or string) in the decoder. 55 | initializer_range (`float`, *optional*, defaults to 0.02): 56 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 57 | rms_norm_eps (`float`, *optional*, defaults to 1e-12): 58 | The epsilon used by the rms normalization layers. 59 | use_cache (`bool`, *optional*, defaults to `True`): 60 | Whether or not the model should return the last key/values attentions (not used by all models). Only 61 | relevant if `config.is_decoder=True`. 62 | tie_word_embeddings(`bool`, *optional*, defaults to `False`): 63 | Whether to tie weight embeddings 64 | Example: 65 | 66 | ```python 67 | >>> from transformers import LLaMAModel, LLaMAConfig 68 | 69 | >>> # Initializing a LLaMA llama-7b style configuration 70 | >>> configuration = LLaMAConfig() 71 | 72 | >>> # Initializing a model from the llama-7b style configuration 73 | >>> model = LLaMAModel(configuration) 74 | 75 | >>> # Accessing the model configuration 76 | >>> configuration = model.config 77 | ```""" 78 | model_type = "llama" 79 | 80 | def __init__( 81 | self, 82 | vocab_size=32000, 83 | hidden_size=4096, 84 | intermediate_size=11008, 85 | num_hidden_layers=32, 86 | num_attention_heads=32, 87 | hidden_act="silu", 88 | initializer_range=0.02, 89 | rms_norm_eps=1e-6, 90 | use_cache=True, 91 | pad_token_id=-1, 92 | bos_token_id=0, 93 | eos_token_id=1, 94 | tie_word_embeddings=False, 95 | **kwargs, 96 | ): 97 | self.vocab_size = vocab_size 98 | self.hidden_size = hidden_size 99 | self.intermediate_size = intermediate_size 100 | self.num_hidden_layers = num_hidden_layers 101 | self.num_attention_heads = num_attention_heads 102 | self.hidden_act = hidden_act 103 | self.initializer_range = initializer_range 104 | self.rms_norm_eps = rms_norm_eps 105 | self.use_cache = use_cache 106 | super().__init__( 107 | pad_token_id=pad_token_id, 108 | bos_token_id=bos_token_id, 109 | eos_token_id=eos_token_id, 110 | tie_word_embeddings=tie_word_embeddings, 111 | **kwargs, 112 | ) 113 | -------------------------------------------------------------------------------- /llama/hf/tokenization_llama.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 2 | # 3 | # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX 4 | # and OPT implementations in this library. 
It has been modified from its 5 | # original forms to accommodate minor architectural differences compared 6 | # to GPT-NeoX and OPT used by the Meta AI team that trained the model. 7 | # 8 | # Licensed under the Apache License, Version 2.0 (the "License"); 9 | # you may not use this file except in compliance with the License. 10 | # You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, software 15 | # distributed under the License is distributed on an "AS IS" BASIS, 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | # See the License for the specific language governing permissions and 18 | # limitations under the License. 19 | 20 | """Tokenization classes for LLaMA.""" 21 | import os 22 | import re 23 | from shutil import copyfile 24 | from typing import Any, Dict, List, Optional, Tuple 25 | 26 | import sentencepiece as spm 27 | 28 | from transformers.tokenization_utils import PreTrainedTokenizer 29 | from transformers.utils import logging 30 | 31 | 32 | logger = logging.get_logger(__name__) 33 | 34 | VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} 35 | 36 | PRETRAINED_VOCAB_FILES_MAP = {} 37 | 38 | 39 | class LLaMATokenizer(PreTrainedTokenizer): 40 | """ 41 | Construct a LLaMA tokenizer. Based on byte-level Byte-Pair-Encoding. 42 | 43 | Args: 44 | vocab_file (`str`): 45 | Path to the vocabulary file. 46 | """ 47 | 48 | vocab_files_names = VOCAB_FILES_NAMES 49 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 50 | model_input_names = ["input_ids", "attention_mask"] 51 | 52 | def __init__( 53 | self, 54 | vocab_file, 55 | unk_token="", 56 | bos_token=" ⁇ ", 57 | eos_token="", 58 | sp_model_kwargs: Optional[Dict[str, Any]] = None, 59 | add_bos_token=True, 60 | add_eos_token=False, 61 | **kwargs, 62 | ): 63 | self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs 64 | super().__init__( 65 | bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs 66 | ) 67 | self.vocab_file = vocab_file 68 | self.add_bos_token = add_bos_token 69 | self.add_eos_token = add_eos_token 70 | self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) 71 | self.sp_model.Load(vocab_file) 72 | 73 | """ Initialisation""" 74 | 75 | @property 76 | def vocab_size(self): 77 | """Returns vocab size""" 78 | return self.sp_model.get_piece_size() 79 | 80 | @property 81 | def bos_token_id(self) -> Optional[int]: 82 | return self.sp_model.bos_id() 83 | 84 | @property 85 | def eos_token_id(self) -> Optional[int]: 86 | return self.sp_model.eos_id() 87 | 88 | def get_vocab(self): 89 | """Returns vocab as a dict""" 90 | vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} 91 | vocab.update(self.added_tokens_encoder) 92 | return vocab 93 | 94 | def _tokenize(self, text): 95 | """Returns a tokenized string.""" 96 | return self.sp_model.encode(text, out_type=str) 97 | 98 | def _convert_token_to_id(self, token): 99 | """Converts a token (str) in an id using the vocab.""" 100 | return self.sp_model.piece_to_id(token) 101 | 102 | def _convert_id_to_token(self, index): 103 | """Converts an index (integer) in a token (str) using the vocab.""" 104 | token = self.sp_model.IdToPiece(index) 105 | return token 106 | 107 | def convert_tokens_to_string(self, tokens): 108 | """Converts a sequence of tokens (string) in a single string.""" 109 | current_sub_tokens = [] 110 | out_string = "" 111 | prev_is_special = False 112 | 
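        # Buffer ordinary pieces in current_sub_tokens and decode each run with a single
        # sp_model.decode() call; special tokens are appended to the output as-is (see the
        # check inside the loop below).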
for token in tokens: 113 | # make sure that special tokens are not decoded using sentencepiece model 114 | if token in self.all_special_tokens: 115 | if not prev_is_special: 116 | out_string += " " 117 | out_string += self.sp_model.decode(current_sub_tokens) + token 118 | prev_is_special = True 119 | current_sub_tokens = [] 120 | else: 121 | current_sub_tokens.append(token) 122 | prev_is_special = False 123 | out_string += self.sp_model.decode(current_sub_tokens) 124 | return out_string.strip() 125 | 126 | def save_vocabulary( 127 | self, save_directory, filename_prefix: Optional[str] = None 128 | ) -> Tuple[str]: 129 | """ 130 | Save the vocabulary and special tokens file to a directory. 131 | 132 | Args: 133 | save_directory (`str`): 134 | The directory in which to save the vocabulary. 135 | 136 | Returns: 137 | `Tuple(str)`: Paths to the files saved. 138 | """ 139 | if not os.path.isdir(save_directory): 140 | logger.error(f"Vocabulary path ({save_directory}) should be a directory") 141 | return 142 | out_vocab_file = os.path.join( 143 | save_directory, 144 | (filename_prefix + "-" if filename_prefix else "") 145 | + VOCAB_FILES_NAMES["vocab_file"], 146 | ) 147 | 148 | if os.path.abspath(self.vocab_file) != os.path.abspath( 149 | out_vocab_file 150 | ) and os.path.isfile(self.vocab_file): 151 | copyfile(self.vocab_file, out_vocab_file) 152 | elif not os.path.isfile(self.vocab_file): 153 | with open(out_vocab_file, "wb") as fi: 154 | content_spiece_model = self.sp_model.serialized_model_proto() 155 | fi.write(content_spiece_model) 156 | 157 | return (out_vocab_file,) 158 | 159 | def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): 160 | if self.add_bos_token: 161 | bos_token_ids = [self.bos_token_id] 162 | else: 163 | bos_token_ids = [] 164 | 165 | output = bos_token_ids + token_ids_0 166 | 167 | if token_ids_1 is not None: 168 | output = output + token_ids_1 169 | 170 | if self.add_eos_token: 171 | output = output + [self.eos_token_id] 172 | 173 | return output 174 | 175 | def get_special_tokens_mask( 176 | self, 177 | token_ids_0: List[int], 178 | token_ids_1: Optional[List[int]] = None, 179 | already_has_special_tokens: bool = False, 180 | ) -> List[int]: 181 | """ 182 | Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding 183 | special tokens using the tokenizer `prepare_for_model` method. 184 | 185 | Args: 186 | token_ids_0 (`List[int]`): 187 | List of IDs. 188 | token_ids_1 (`List[int]`, *optional*): 189 | Optional second list of IDs for sequence pairs. 190 | already_has_special_tokens (`bool`, *optional*, defaults to `False`): 191 | Whether or not the token list is already formatted with special tokens for the model. 192 | 193 | Returns: 194 | `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 195 | """ 196 | if already_has_special_tokens: 197 | return super().get_special_tokens_mask( 198 | token_ids_0=token_ids_0, 199 | token_ids_1=token_ids_1, 200 | already_has_special_tokens=True, 201 | ) 202 | 203 | if token_ids_1 is None: 204 | return [1] + ([0] * len(token_ids_0)) + [1] 205 | return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] 206 | 207 | def create_token_type_ids_from_sequences( 208 | self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None 209 | ) -> List[int]: 210 | """ 211 | Create a mask from the two sequences passed to be used in a sequence-pair classification task. 
T5 does not make 212 | use of token type ids, therefore a list of zeros is returned. 213 | 214 | Args: 215 | token_ids_0 (`List[int]`): 216 | List of IDs. 217 | token_ids_1 (`List[int]`, *optional*): 218 | Optional second list of IDs for sequence pairs. 219 | 220 | Returns: 221 | `List[int]`: List of zeros. 222 | """ 223 | eos = [self.eos_token_id] 224 | 225 | if token_ids_1 is None: 226 | return len(token_ids_0 + eos) * [0] 227 | return len(token_ids_0 + eos + token_ids_1 + eos) * [0] 228 | -------------------------------------------------------------------------------- /llama/hf/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .modeling_llama import LLaMAForCausalLM 3 | 4 | 5 | def non_ops(*args, **kwargs): 6 | pass 7 | 8 | 9 | def avoid_tensor_modified(): 10 | torch.nn.init.kaiming_uniform_ = non_ops 11 | torch.nn.init.uniform_ = non_ops 12 | torch.nn.init.normal_ = non_ops 13 | 14 | 15 | def get_llama(model, seqlen=1024): 16 | avoid_tensor_modified() 17 | model = LLaMAForCausalLM.from_pretrained(model, torch_dtype="auto") 18 | model.seqlen = seqlen 19 | return model 20 | -------------------------------------------------------------------------------- /llama/llama_infer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from llama.hf import LLaMATokenizer 4 | from llama.hf.utils import get_llama 5 | from llama.llama_quant import load_quant 6 | 7 | 8 | def get_args(): 9 | import argparse 10 | 11 | parser = argparse.ArgumentParser() 12 | 13 | parser.add_argument("--model", type=str, default="decapoda-research/llama-7b-hf", help="llama model to load") 14 | parser.add_argument( 15 | "--wbits", 16 | type=int, 17 | default=16, 18 | choices=[2, 3, 4, 8, 16], 19 | help="#bits to use for quantization; use 16 for evaluating base model.", 20 | ) 21 | parser.add_argument("--load", type=str, default="", help="Load quantized model.") 22 | parser.add_argument("--text", type=str, help="input text") 23 | parser.add_argument( 24 | "--min_length", 25 | type=int, 26 | default=10, 27 | help="The minimum length of the sequence to be generated.", 28 | ) 29 | parser.add_argument( 30 | "--seqlen", 31 | type=int, 32 | default=1024, 33 | help="The maximum length of the input sequence that LLaMA can process.", 34 | ) 35 | parser.add_argument( 36 | "--max_length", 37 | type=int, 38 | default=50, 39 | help="The maximum length of the output sequence to be generated.", 40 | ) 41 | 42 | parser.add_argument( 43 | "--top_p", 44 | type=float, 45 | default=0.95, 46 | help="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.", 47 | ) 48 | 49 | parser.add_argument( 50 | "--temperature", 51 | type=float, 52 | default=0.8, 53 | help="The value used to module the next token probabilities.", 54 | ) 55 | parser.add_argument( 56 | "--cuda", type=str, default="cuda:0", help="GPU device string, eg cuda:0." 
57 | ) 58 | args = parser.parse_args() 59 | return args 60 | 61 | 62 | def run(args=None): 63 | args = args or get_args() 64 | if args.load: 65 | model = load_quant(args.model, args.load, args.wbits, args.seqlen) 66 | else: 67 | model = get_llama(args.model) 68 | model.eval() 69 | if args.cuda.startswith("cuda"): 70 | dev = torch.device(args.cuda) 71 | else: 72 | dev = torch.device("cpu") 73 | 74 | model.to(dev) 75 | tokenizer = LLaMATokenizer.from_pretrained(args.model) 76 | input_ids = tokenizer.encode(args.text, return_tensors="pt").to(dev) 77 | 78 | with torch.no_grad(): 79 | generated_ids = model.generate( 80 | input_ids, 81 | do_sample=True, 82 | min_length=args.min_length, 83 | max_length=args.max_length, 84 | top_p=args.top_p, 85 | temperature=args.temperature, 86 | ) 87 | print("*"*80) 88 | print("🦙:", tokenizer.decode([el.item() for el in generated_ids[0]])) 89 | 90 | 91 | if __name__ == "__main__": 92 | run() 93 | -------------------------------------------------------------------------------- /llama/llama_multigpu.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import torch 4 | from accelerate import init_empty_weights, load_checkpoint_and_dispatch 5 | from tqdm import tqdm 6 | from pathlib import Path 7 | import hiq 8 | import os 9 | from llama import ModelArgs, Tokenizer, Transformer, LLaMA 10 | 11 | NUM_SHARDS = { 12 | "7B": 1, 13 | "13B": 2, 14 | "30B": 4, 15 | "65B": 8, 16 | } 17 | 18 | class LLaMAInference: 19 | def __init__(self, state_dict_dir, model_size, device_map="auto", **kwargs): 20 | 21 | state_dict = os.path.join(state_dict_dir, model_size, "state_dict.pt") 22 | params_file = os.path.join(state_dict_dir, model_size, "params.json") 23 | tokenizer_path = os.path.join(state_dict_dir, "tokenizer.model") 24 | params = hiq.read_file(params_file, as_json=True) 25 | 26 | model_args = dict( 27 | max_seq_len=2048, 28 | max_batch_size=1, 29 | **params 30 | ) 31 | model_args.update(kwargs) 32 | model_args = ModelArgs(**model_args) 33 | 34 | self.tokenizer = Tokenizer(model_path=tokenizer_path) 35 | model_args.vocab_size = self.tokenizer.n_words 36 | 37 | with init_empty_weights(): 38 | torch.set_default_tensor_type(torch.HalfTensor) 39 | model = Transformer(model_args) 40 | torch.set_default_tensor_type(torch.FloatTensor) 41 | 42 | self.model = load_checkpoint_and_dispatch( 43 | model, 44 | state_dict, 45 | device_map=device_map, 46 | no_split_module_classes=["TransformerBlock"] 47 | ) 48 | 49 | self.generator = LLaMA(self.model, self.tokenizer) 50 | 51 | def generate(self, texts, temperature=0.8, top_p=0.95, max_length=256, stop_ids=None, stop_words=None): 52 | results = self.generator.generate( 53 | texts, 54 | max_gen_len=max_length, 55 | temperature=temperature, 56 | top_p=top_p, 57 | stop_ids=stop_ids, 58 | stop_words=stop_words 59 | ) 60 | return results 61 | 62 | def get_args(): 63 | import argparse 64 | 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument("--state_dict_dir", type=str, default="/llama_data/7B") 67 | parser.add_argument( 68 | "--model_size", 69 | choices=NUM_SHARDS.keys(), 70 | ) 71 | return parser.parse_args() 72 | 73 | if __name__ == "__main__": 74 | args = get_args() 75 | i = LLaMAInference(args.state_dict_dir, args.model_size) 76 | results = i.generate(["The meaning of life is"]) 77 | for result in results: 78 | print("🦙LLaMA:", result.strip()) 79 | 80 | 81 | results = i.generate(["Question: why apple drops from the tree when it is ripe?\nAnswer:"], 82 | 
stop_words=["Question"]) 83 | for result in results: 84 | print("🦙LLaMA:", result.strip()) 85 | 86 | 87 | -------------------------------------------------------------------------------- /llama/llama_quant.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from gptq import ( 7 | GPTQ, 8 | Quantizer, 9 | find_layers, 10 | make_quant, 11 | QuantLinear, 12 | get_loaders, 13 | quantize, 14 | ) 15 | 16 | from llama.hf import LLaMAForCausalLM, LLaMATokenizer, LLaMAConfig 17 | from llama.hf.utils import avoid_tensor_modified, get_llama 18 | 19 | 20 | @torch.no_grad() 21 | def llama_sequential(model, dataloader, args, dev): 22 | use_cache = model.config.use_cache 23 | model.config.use_cache = False 24 | layers = model.model.layers 25 | 26 | model.model.embed_tokens = model.model.embed_tokens.to(dev) 27 | model.model.norm = model.model.norm.to(dev) 28 | layers[0] = layers[0].to(dev) 29 | 30 | dtype = next(iter(model.parameters())).dtype 31 | inps = torch.zeros( 32 | (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev 33 | ) 34 | cache = {"i": 0, "attention_mask": None} 35 | 36 | class Catcher(nn.Module): 37 | def __init__(self, module): 38 | super().__init__() 39 | self.module = module 40 | 41 | def forward(self, inp, **kwargs): 42 | # print("kwargs:", kwargs.keys()) 43 | inps[cache["i"]] = inp 44 | cache["i"] += 1 45 | cache["attention_mask"] = kwargs["attention_mask"] 46 | raise ValueError 47 | 48 | layers[0] = Catcher(layers[0]) 49 | for batch in dataloader: 50 | try: 51 | i = batch[0].to(dev) 52 | model(i) 53 | except ValueError: 54 | pass 55 | layers[0] = layers[0].module 56 | 57 | layers[0] = layers[0].cpu() 58 | model.model.embed_tokens = model.model.embed_tokens.cpu() 59 | model.model.norm = model.model.norm.cpu() 60 | torch.cuda.empty_cache() 61 | 62 | outs = torch.zeros_like(inps) 63 | attention_mask = cache["attention_mask"] 64 | 65 | quantizers = {} 66 | for i in range(len(layers)): 67 | layer = layers[i].to(dev) 68 | subset = find_layers(layer) 69 | name_to_gptq = {} 70 | for name in subset: 71 | name_to_gptq[name] = GPTQ(subset[name]) 72 | name_to_gptq[name].quantizer = Quantizer() 73 | name_to_gptq[name].quantizer.configure( 74 | args.wbits, perchannel=True, sym=False, mse=False 75 | ) 76 | 77 | def add_batch(name): 78 | def tmp(_, inp, out): 79 | name_to_gptq[name].add_batch(inp[0].data, out.data) 80 | 81 | return tmp 82 | 83 | handles = [] 84 | for name in subset: 85 | handles.append(subset[name].register_forward_hook(add_batch(name))) 86 | for j in range(args.nsamples): 87 | outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] 88 | for h in handles: 89 | h.remove() 90 | print(f"\nQuantize layer: {i} ", end=',') 91 | for name in subset: 92 | print(name, end=",") 93 | name_to_gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize) 94 | quantizers["model.layers.%d.%s" % (i, name)] = name_to_gptq[name].quantizer 95 | name_to_gptq[name].free() 96 | for j in range(args.nsamples): 97 | outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] 98 | 99 | layers[i] = layer.cpu() 100 | del layer 101 | del name_to_gptq 102 | torch.cuda.empty_cache() 103 | 104 | inps, outs = outs, inps 105 | 106 | model.config.use_cache = use_cache 107 | return quantizers 108 | 109 | 110 | @torch.no_grad() 111 | def llama_eval(model, testenc, args, dev): 112 | print("Evaluating ...") 113 | 114 | testenc = testenc.input_ids 115 | 
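    # Perplexity evaluation: split the tokenized test set into nsamples disjoint windows of
    # model.seqlen tokens, run them through the (possibly quantized) decoder one layer at a
    # time to bound GPU memory, and report exp(mean per-token negative log-likelihood) at the end.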
nsamples = testenc.numel() // model.seqlen 116 | 117 | use_cache = model.config.use_cache 118 | model.config.use_cache = False 119 | layers = model.model.layers 120 | 121 | model.model.embed_tokens = model.model.embed_tokens.to(dev) 122 | layers[0] = layers[0].to(dev) 123 | 124 | dtype = next(iter(model.parameters())).dtype 125 | inps = torch.zeros( 126 | (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev 127 | ) 128 | cache = {"i": 0, "attention_mask": None} 129 | 130 | class Catcher(nn.Module): 131 | def __init__(self, module): 132 | super().__init__() 133 | self.module = module 134 | 135 | def forward(self, inp, **kwargs): 136 | inps[cache["i"]] = inp 137 | cache["i"] += 1 138 | cache["attention_mask"] = kwargs["attention_mask"] 139 | raise ValueError 140 | 141 | layers[0] = Catcher(layers[0]) 142 | for i in range(nsamples): 143 | batch = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)].to(dev) 144 | try: 145 | model(batch) 146 | except ValueError: 147 | pass 148 | layers[0] = layers[0].module 149 | 150 | layers[0] = layers[0].cpu() 151 | model.model.embed_tokens = model.model.embed_tokens.cpu() 152 | torch.cuda.empty_cache() 153 | 154 | outs = torch.zeros_like(inps) 155 | attention_mask = cache["attention_mask"] 156 | 157 | for i in range(len(layers)): 158 | print(i) 159 | layer = layers[i].to(dev) 160 | 161 | if args.nearest: 162 | subset = find_layers(layer) 163 | for name in subset: 164 | quantizer = Quantizer() 165 | quantizer.configure(args.wbits, perchannel=True, sym=False, mse=False) 166 | W = subset[name].weight.data 167 | quantizer.find_params(W, weight=True) 168 | subset[name].weight.data = quantize( 169 | W, quantizer.scale, quantizer.zero, quantizer.maxq 170 | ).to(next(iter(layer.parameters())).dtype) 171 | 172 | for j in range(nsamples): 173 | outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] 174 | layers[i] = layer.cpu() 175 | del layer 176 | torch.cuda.empty_cache() 177 | inps, outs = outs, inps 178 | 179 | if model.model.norm is not None: 180 | model.model.norm = model.model.norm.to(dev) 181 | model.lm_head = model.lm_head.to(dev) 182 | 183 | testenc = testenc.to(dev) 184 | nlls = [] 185 | for i in range(nsamples): 186 | hidden_states = inps[i].unsqueeze(0) 187 | if model.model.norm is not None: 188 | hidden_states = model.model.norm(hidden_states) 189 | lm_logits = model.lm_head(hidden_states) 190 | shift_logits = lm_logits[:, :-1, :].contiguous() 191 | shift_labels = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)][:, 1:] 192 | loss_fct = nn.CrossEntropyLoss() 193 | loss = loss_fct( 194 | shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1) 195 | ) 196 | neg_log_likelihood = loss.float() * model.seqlen 197 | nlls.append(neg_log_likelihood) 198 | ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) 199 | print(ppl.item()) 200 | 201 | model.config.use_cache = use_cache 202 | 203 | 204 | # TODO: perform packing on GPU 205 | def llama_pack(model, quantizers, wbits): 206 | layers = find_layers(model) 207 | layers = {n: layers[n] for n in quantizers} 208 | make_quant(model, quantizers, wbits) 209 | qlayers = find_layers(model, [QuantLinear]) 210 | for name in qlayers: 211 | print(name) 212 | quantizers[name] = quantizers[name].cpu() 213 | qlayers[name].pack(layers[name], quantizers[name].scale, quantizers[name].zero) 214 | return model 215 | 216 | 217 | def load_quant(model_name, checkpoint, wbits, seqlen=1024, for_infer=True): 218 | """ 219 | seqlen - seqlen refers to the 
maximum length of the input sequence that the model can process. The input sequence can be a sequence of words, tokens, or characters, depending on how the model is tokenized. The seqlen parameter is important because it determines the amount of memory that the model requires to process the input sequence. If the input sequence is too long, it may exceed the memory capacity of the model, leading to out-of-memory errors or slower inference times. In order to handle longer sequences, some models use techniques such as attention masking or truncation, which allow the model to process only a portion of the input sequence at a time. The seqlen parameter determines the maximum length of the input sequence that can be processed in a single step. If the input sequence is longer than the seqlen parameter, it may need to be split into multiple segments and processed separately. 220 | """ 221 | import transformers 222 | 223 | config = LLaMAConfig.from_pretrained(model_name) 224 | avoid_tensor_modified() 225 | 226 | transformers.modeling_utils._init_weights = False 227 | torch.set_default_dtype(torch.half) 228 | model = LLaMAForCausalLM(config) 229 | torch.set_default_dtype(torch.float) 230 | if for_infer: 231 | model = model.eval() 232 | layers = find_layers(model) 233 | for name in ["lm_head"]: 234 | if name in layers: 235 | del layers[name] 236 | make_quant(model, layers, wbits) 237 | 238 | print(f"⌛️ Loading model from {checkpoint}...") 239 | model.load_state_dict(torch.load(checkpoint)) 240 | model.seqlen = seqlen 241 | print(f"✅ Model from {checkpoint} is loaded successfully.") 242 | 243 | return model 244 | 245 | 246 | def llama_multigpu(model, gpus): 247 | """A model parallelism implementation for LLaMA""" 248 | import math 249 | import copy 250 | 251 | model.model.embed_tokens = model.model.embed_tokens.to(gpus[0]) 252 | if hasattr(model.model, "norm") and model.model.norm: 253 | model.model.norm = model.model.norm.to(gpus[-1]) 254 | 255 | model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1]) 256 | 257 | cache = {"mask": None} 258 | 259 | class MoveModule(nn.Module): 260 | def __init__(self, module): 261 | super().__init__() 262 | self.module = module 263 | self.dev = next(iter(self.module.parameters())).device 264 | 265 | def forward(self, *inp, **kwargs): 266 | inp = list(inp) 267 | if inp[0].device != self.dev: 268 | inp[0] = inp[0].to(self.dev) 269 | if cache["mask"] is None or cache["mask"].device != self.dev: 270 | cache["mask"] = kwargs["attention_mask"].to(self.dev) 271 | kwargs["attention_mask"] = cache["mask"] 272 | tmp = self.module(*inp, **kwargs) 273 | return tmp 274 | 275 | layers = model.model.layers 276 | pergpu = math.ceil(len(layers) / len(gpus)) 277 | for i in range(len(layers)): 278 | layers[i] = MoveModule(layers[i].to(gpus[i // pergpu])) 279 | 280 | model.gpus = gpus 281 | 282 | 283 | def run_benchmark(model, input_ids, check=False, dev=torch.device("cuda:0")): 284 | input_ids = input_ids.to(model.gpus[0] if hasattr(model, "gpus") else dev) 285 | torch.cuda.synchronize() 286 | 287 | cache = {"past": None} 288 | 289 | def clear_past(i): 290 | def tmp(layer, inp, out): 291 | if cache["past"]: 292 | cache["past"][i] = None 293 | 294 | return tmp 295 | 296 | for i, layer in enumerate(model.model.layers): 297 | layer.register_forward_hook(clear_past(i)) 298 | 299 | print("Benchmarking ...") 300 | 301 | if check: 302 | loss = nn.CrossEntropyLoss() 303 | tot = 0.0 304 | 305 | def sync(): 306 | if hasattr(model, "gpus"): 307 | for gpu in model.gpus: 308 | 
torch.cuda.synchronize(gpu) 309 | else: 310 | torch.cuda.synchronize() 311 | 312 | max_memory = 0 313 | with torch.no_grad(): 314 | attention_mask = torch.ones((1, input_ids.numel()), device=dev) 315 | times = [] 316 | for i in range(input_ids.numel()): 317 | tick = time.time() 318 | out = model( 319 | input_ids[:, i].reshape(-1), 320 | past_key_values=cache["past"], 321 | attention_mask=attention_mask[:, : (i + 1)].reshape((1, -1)), 322 | ) 323 | sync() 324 | times.append(time.time() - tick) 325 | print(i, times[-1]) 326 | max_memory = max(max_memory, torch.cuda.memory_allocated() / 1024 / 1024) 327 | if check and i != input_ids.numel() - 1: 328 | tot += loss( 329 | out.logits[0].to(dev), input_ids[:, (i + 1)].to(dev) 330 | ).float() 331 | cache["past"] = list(out.past_key_values) 332 | del out 333 | sync() 334 | import numpy as np 335 | 336 | print("Median:", np.median(times)) 337 | if check: 338 | print("PPL:", torch.exp(tot / (input_ids.numel() - 1)).item()) 339 | print("max memory(MiB):", max_memory) 340 | 341 | 342 | def get_args(): 343 | import argparse 344 | 345 | parser = argparse.ArgumentParser() 346 | parser.add_argument( 347 | "model", 348 | type=str, 349 | help="llama model to load", 350 | default="decapoda-research/llama-7b-hf", 351 | ) 352 | parser.add_argument( 353 | "dataset", 354 | type=str, 355 | choices=["wikitext2", "ptb", "c4"], 356 | help="Where to extract calibration data from.", 357 | ) 358 | parser.add_argument("--ckpt_dir", type=str, default="/llama_data/7B") 359 | parser.add_argument( 360 | "--tokenizer_path", type=str, default="/llama_data/tokenizer.model" 361 | ) 362 | parser.add_argument( 363 | "--seed", type=int, default=0, help="Seed for sampling the calibration data." 364 | ) 365 | parser.add_argument( 366 | "--nsamples", type=int, default=128, help="Number of calibration data samples." 367 | ) 368 | parser.add_argument( 369 | "--percdamp", 370 | type=float, 371 | default=0.01, 372 | help="Percent of the average Hessian diagonal to use for dampening.", 373 | ) 374 | parser.add_argument( 375 | "--nearest", action="store_true", help="Whether to run the RTN baseline." 
376 | ) 377 | parser.add_argument( 378 | "--wbits", 379 | type=int, 380 | default=16, 381 | choices=[2, 3, 4, 8, 16], 382 | help="#bits to use for quantization; use 16 for evaluating base model.", 383 | ) 384 | parser.add_argument( 385 | "--groupsize", 386 | type=int, 387 | default=-1, 388 | help="Groupsize to use for quantization; default uses full row.", 389 | ) 390 | parser.add_argument( 391 | "--save", 392 | type=str, 393 | default="", 394 | help="Save quantized checkpoint under this name, eg pyllama-7B4b.pt.", 395 | ) 396 | parser.add_argument("--load", type=str, default="", help="Load quantized model.") 397 | parser.add_argument( 398 | "--benchmark", 399 | type=int, 400 | default=0, 401 | help="Number of tokens to use for benchmarking.", 402 | ) 403 | parser.add_argument( 404 | "--check", 405 | action="store_true", 406 | help="Whether to compute perplexity during benchmarking for verification.", 407 | ) 408 | parser.add_argument( 409 | "--cuda", 410 | type=str, 411 | default="cuda:0", 412 | help="GPU device string, 'cuda:0' by default.", 413 | ) 414 | parser.add_argument( 415 | "--eval", 416 | action="store_false", 417 | help="Evaluate the model with dataset wikitext2, ptb and c4", 418 | ) 419 | 420 | args = parser.parse_args() 421 | return args 422 | 423 | 424 | def run(args=None): 425 | args = args or get_args() 426 | if args.load: 427 | model = load_quant(args.model, args.load, args.wbits) 428 | else: 429 | model = get_llama(args.model) 430 | model.eval() 431 | if args.cuda.startswith("cuda"): 432 | dev = torch.device(args.cuda) 433 | else: 434 | dev = torch.device("cpu") 435 | 436 | tokenizer = LLaMATokenizer.from_pretrained( 437 | args.model, add_eos_token=True 438 | ) 439 | dataloader, testloader = get_loaders( 440 | args.dataset, 441 | nsamples=args.nsamples, 442 | seed=args.seed, 443 | model=args.model, 444 | seqlen=model.seqlen, 445 | tokenizer=tokenizer 446 | ) 447 | 448 | if not args.load and args.wbits < 16 and not args.nearest: 449 | quantizers = llama_sequential(model, dataloader, args, dev) 450 | 451 | if args.benchmark: 452 | gpus = [torch.device("cuda:%d" % i) for i in range(torch.cuda.device_count())] 453 | if len(gpus) > 1: 454 | llama_multigpu(model, gpus) 455 | else: 456 | model = model.to(dev) 457 | if args.benchmark: 458 | input_ids = next(iter(dataloader))[0][:, : args.benchmark] 459 | run_benchmark(model, input_ids, check=args.check) 460 | if args.load: 461 | exit() 462 | 463 | if args.save: 464 | llama_pack(model, quantizers, args.wbits) 465 | torch.save(model.state_dict(), args.save) 466 | 467 | if args.eval: 468 | for dataset in ["wikitext2", "ptb", "c4"]: 469 | dataloader, testloader = get_loaders( 470 | dataset, seed=args.seed, model=args.model, seqlen=model.seqlen, tokenizer=tokenizer 471 | ) 472 | print(dataset) 473 | llama_eval(model, testloader, args, dev) 474 | 475 | 476 | if __name__ == "__main__": 477 | run() 478 | -------------------------------------------------------------------------------- /llama/model_parallel.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the GNU General Public License version 3. 
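# Model-parallel definition of the LLaMA transformer: the attention/FFN projections use
# fairscale's ColumnParallelLinear/RowParallelLinear and the token embedding is a
# ParallelEmbedding, so each rank only materializes n_heads // world_size local heads
# (see Attention below). model_single.py provides the equivalent single-GPU model built
# from plain nn.Linear / nn.Embedding.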
3 | 4 | from typing import Optional, Tuple 5 | from dataclasses import dataclass 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | import torch.nn.functional as F 11 | 12 | import fairscale.nn.model_parallel.initialize as fs_init 13 | from fairscale.nn.model_parallel.layers import ( 14 | ParallelEmbedding, 15 | RowParallelLinear, 16 | ColumnParallelLinear, 17 | ) 18 | 19 | 20 | @dataclass 21 | class ModelArgs: 22 | dim: int = 512 23 | n_layers: int = 8 24 | n_heads: int = 8 25 | vocab_size: int = -1 # defined later by tokenizer 26 | multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 27 | norm_eps: float = 1e-5 28 | 29 | max_batch_size: int = 32 30 | max_seq_len: int = 2048 31 | 32 | 33 | class RMSNorm(torch.nn.Module): 34 | def __init__(self, dim: int, eps: float = 1e-6): 35 | super().__init__() 36 | self.eps = eps 37 | self.weight = nn.Parameter(torch.ones(dim)) 38 | 39 | def _norm(self, x): 40 | return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) 41 | 42 | def forward(self, x): 43 | output = self._norm(x.float()).type_as(x) 44 | return output * self.weight 45 | 46 | 47 | def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0): 48 | freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) 49 | t = torch.arange(end, device=freqs.device) # type: ignore 50 | freqs = torch.outer(t, freqs).float() # type: ignore 51 | freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64 52 | return freqs_cis 53 | 54 | 55 | def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): 56 | ndim = x.ndim 57 | assert 0 <= 1 < ndim 58 | assert freqs_cis.shape == (x.shape[1], x.shape[-1]) 59 | shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] 60 | return freqs_cis.view(*shape) 61 | 62 | 63 | def apply_rotary_emb( 64 | xq: torch.Tensor, 65 | xk: torch.Tensor, 66 | freqs_cis: torch.Tensor, 67 | ) -> Tuple[torch.Tensor, torch.Tensor]: 68 | xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) 69 | xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) 70 | freqs_cis = reshape_for_broadcast(freqs_cis, xq_) 71 | xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3) 72 | xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3) 73 | return xq_out.type_as(xq), xk_out.type_as(xk) 74 | 75 | 76 | class Attention(nn.Module): 77 | def __init__(self, args: ModelArgs): 78 | super().__init__() 79 | 80 | self.n_local_heads = args.n_heads // fs_init.get_model_parallel_world_size() 81 | self.head_dim = args.dim // args.n_heads 82 | 83 | self.wq = ColumnParallelLinear( 84 | args.dim, 85 | args.n_heads * self.head_dim, 86 | bias=False, 87 | gather_output=False, 88 | init_method=lambda x: x, 89 | ) 90 | self.wk = ColumnParallelLinear( 91 | args.dim, 92 | args.n_heads * self.head_dim, 93 | bias=False, 94 | gather_output=False, 95 | init_method=lambda x: x, 96 | ) 97 | self.wv = ColumnParallelLinear( 98 | args.dim, 99 | args.n_heads * self.head_dim, 100 | bias=False, 101 | gather_output=False, 102 | init_method=lambda x: x, 103 | ) 104 | self.wo = RowParallelLinear( 105 | args.n_heads * self.head_dim, 106 | args.dim, 107 | bias=False, 108 | input_is_parallel=True, 109 | init_method=lambda x: x, 110 | ) 111 | 112 | self.cache_k = torch.zeros( 113 | (args.max_batch_size, args.max_seq_len, self.n_local_heads, self.head_dim) 114 | ).cuda() 115 | self.cache_v = torch.zeros( 116 | (args.max_batch_size, args.max_seq_len, self.n_local_heads, self.head_dim) 117 | 
).cuda() 118 | 119 | def forward( 120 | self, 121 | x: torch.Tensor, 122 | start_pos: int, 123 | freqs_cis: torch.Tensor, 124 | mask: Optional[torch.Tensor], 125 | ): 126 | bsz, seqlen, _ = x.shape 127 | xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) 128 | 129 | xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim) 130 | xk = xk.view(bsz, seqlen, self.n_local_heads, self.head_dim) 131 | xv = xv.view(bsz, seqlen, self.n_local_heads, self.head_dim) 132 | 133 | xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis) 134 | 135 | self.cache_k = self.cache_k.to(xq) 136 | self.cache_v = self.cache_v.to(xq) 137 | 138 | self.cache_k[:bsz, start_pos : start_pos + seqlen] = xk 139 | self.cache_v[:bsz, start_pos : start_pos + seqlen] = xv 140 | 141 | keys = self.cache_k[:bsz, : start_pos + seqlen] 142 | values = self.cache_v[:bsz, : start_pos + seqlen] 143 | 144 | xq = xq.transpose(1, 2) 145 | keys = keys.transpose(1, 2) 146 | values = values.transpose(1, 2) 147 | scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim) 148 | if mask is not None: 149 | scores = scores + mask # (bs, n_local_heads, slen, cache_len + slen) 150 | scores = F.softmax(scores.float(), dim=-1).type_as(xq) 151 | output = torch.matmul(scores, values) # (bs, n_local_heads, slen, head_dim) 152 | output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1) 153 | 154 | return self.wo(output) 155 | 156 | 157 | class FeedForward(nn.Module): 158 | def __init__( 159 | self, 160 | dim: int, 161 | hidden_dim: int, 162 | multiple_of: int, 163 | ): 164 | super().__init__() 165 | hidden_dim = int(2 * hidden_dim / 3) 166 | hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) 167 | 168 | self.w1 = ColumnParallelLinear( 169 | dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x 170 | ) 171 | self.w2 = RowParallelLinear( 172 | hidden_dim, dim, bias=False, input_is_parallel=True, init_method=lambda x: x 173 | ) 174 | self.w3 = ColumnParallelLinear( 175 | dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x 176 | ) 177 | 178 | def forward(self, x): 179 | return self.w2(F.silu(self.w1(x)) * self.w3(x)) 180 | 181 | 182 | class TransformerBlock(nn.Module): 183 | def __init__(self, layer_id: int, args: ModelArgs): 184 | super().__init__() 185 | self.n_heads = args.n_heads 186 | self.dim = args.dim 187 | self.head_dim = args.dim // args.n_heads 188 | self.attention = Attention(args) 189 | self.feed_forward = FeedForward( 190 | dim=args.dim, hidden_dim=4 * args.dim, multiple_of=args.multiple_of 191 | ) 192 | self.layer_id = layer_id 193 | self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps) 194 | self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps) 195 | 196 | def forward( 197 | self, 198 | x: torch.Tensor, 199 | start_pos: int, 200 | freqs_cis: torch.Tensor, 201 | mask: Optional[torch.Tensor], 202 | ): 203 | h = x + self.attention.forward( 204 | self.attention_norm(x), start_pos, freqs_cis, mask 205 | ) 206 | out = h + self.feed_forward.forward(self.ffn_norm(h)) 207 | return out 208 | 209 | 210 | class Transformer(nn.Module): 211 | def __init__(self, params: ModelArgs): 212 | super().__init__() 213 | self.params = params 214 | self.vocab_size = params.vocab_size 215 | self.n_layers = params.n_layers 216 | 217 | self.tok_embeddings = ParallelEmbedding( 218 | params.vocab_size, params.dim, init_method=lambda x: x 219 | ) 220 | 221 | self.layers = torch.nn.ModuleList() 222 | for layer_id in range(params.n_layers): 223 | 
self.layers.append(TransformerBlock(layer_id, params)) 224 | 225 | self.norm = RMSNorm(params.dim, eps=params.norm_eps) 226 | self.output = ColumnParallelLinear( 227 | params.dim, params.vocab_size, bias=False, init_method=lambda x: x 228 | ) 229 | 230 | self.freqs_cis = precompute_freqs_cis( 231 | self.params.dim // self.params.n_heads, self.params.max_seq_len * 2 232 | ) 233 | 234 | @torch.inference_mode() 235 | def forward(self, tokens: torch.Tensor, start_pos: int): 236 | _bsz, seqlen = tokens.shape 237 | h = self.tok_embeddings(tokens) 238 | self.freqs_cis = self.freqs_cis.to(h.device) 239 | freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen] 240 | 241 | mask = None 242 | if seqlen > 1: 243 | mask = torch.full( 244 | (1, 1, seqlen, seqlen), float("-inf"), device=tokens.device 245 | ) 246 | mask = torch.triu(mask, diagonal=start_pos + 1).type_as(h) 247 | 248 | for layer in self.layers: 249 | h = layer(h, start_pos, freqs_cis, mask) 250 | h = self.norm(h) 251 | output = self.output(h[:, -1, :]) # only compute last logits 252 | return output.float() 253 | -------------------------------------------------------------------------------- /llama/model_single.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | from dataclasses import dataclass 3 | import math 4 | import torch 5 | from torch import nn 6 | import torch.nn.functional as F 7 | import hiq 8 | 9 | 10 | @dataclass 11 | class ModelArgs: 12 | dim: int = 512 13 | n_layers: int = 8 14 | n_heads: int = 8 15 | vocab_size: int = -1 # defined later by tokenizer 16 | multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 17 | norm_eps: float = 1e-5 18 | 19 | max_batch_size: int = 1 20 | max_seq_len: int = 2048 21 | 22 | 23 | class RMSNorm(torch.nn.Module): 24 | def __init__(self, dim: int, eps: float = 1e-6): 25 | super().__init__() 26 | self.eps = eps 27 | self.weight = nn.Parameter(torch.ones(dim)) 28 | 29 | def _norm(self, x): 30 | return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) 31 | 32 | def forward(self, x): 33 | output = self._norm(x.float()).type_as(x) 34 | return output * self.weight 35 | 36 | 37 | def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0): 38 | freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) 39 | t = torch.arange(end, device=freqs.device) # type: ignore 40 | freqs = torch.outer(t, freqs).float() # type: ignore 41 | freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64 42 | return freqs_cis 43 | 44 | 45 | def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): 46 | ndim = x.ndim 47 | assert 0 <= 1 < ndim 48 | assert freqs_cis.shape == (x.shape[1], x.shape[-1]) 49 | shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] 50 | return freqs_cis.view(*shape) 51 | 52 | 53 | def apply_rotary_emb( 54 | xq: torch.Tensor, 55 | xk: torch.Tensor, 56 | freqs_cis: torch.Tensor, 57 | ) -> Tuple[torch.Tensor, torch.Tensor]: 58 | xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) 59 | xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) 60 | freqs_cis = reshape_for_broadcast(freqs_cis, xq_) 61 | xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3) 62 | xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3) 63 | return xq_out.type_as(xq), xk_out.type_as(xk) 64 | 65 | 66 | class Attention(nn.Module): 67 | def __init__(self, args: ModelArgs): 68 | super().__init__() 69 | 70 | 
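        # Single-process variant: there is no tensor parallelism here, so every head is
        # "local" (world size of 1) and the projections are plain nn.Linear layers.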
self.n_local_heads = args.n_heads // 1 71 | self.head_dim = args.dim // args.n_heads 72 | 73 | self.wq = nn.Linear( 74 | args.dim, 75 | args.n_heads * self.head_dim, 76 | bias=False, 77 | ) 78 | self.wk = nn.Linear( 79 | args.dim, 80 | args.n_heads * self.head_dim, 81 | bias=False, 82 | ) 83 | self.wv = nn.Linear( 84 | args.dim, 85 | args.n_heads * self.head_dim, 86 | bias=False, 87 | ) 88 | self.wo = nn.Linear( 89 | args.n_heads * self.head_dim, 90 | args.dim, 91 | bias=False, 92 | ) 93 | self.cache_k = torch.zeros( 94 | (args.max_batch_size, args.max_seq_len, self.n_local_heads, self.head_dim) 95 | ) 96 | self.cache_v = torch.zeros( 97 | (args.max_batch_size, args.max_seq_len, self.n_local_heads, self.head_dim) 98 | ) 99 | if hiq.get_env_bool("KV_CAHCHE_IN_GPU", True): 100 | self.cache_k = self.cache_k.cuda() 101 | self.cache_v = self.cache_v.cuda() 102 | 103 | def forward( 104 | self, 105 | x: torch.Tensor, 106 | start_pos: int, 107 | freqs_cis: torch.Tensor, 108 | mask: Optional[torch.Tensor], 109 | ): 110 | bsz, seqlen, _ = x.shape 111 | xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) 112 | 113 | xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim) 114 | xk = xk.view(bsz, seqlen, self.n_local_heads, self.head_dim) 115 | xv = xv.view(bsz, seqlen, self.n_local_heads, self.head_dim) 116 | 117 | xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis) 118 | 119 | self.cache_k = self.cache_k.to(xq) 120 | self.cache_v = self.cache_v.to(xq) 121 | 122 | self.cache_k[:bsz, start_pos : start_pos + seqlen] = xk 123 | self.cache_v[:bsz, start_pos : start_pos + seqlen] = xv 124 | 125 | keys = self.cache_k[:bsz, : start_pos + seqlen] 126 | values = self.cache_v[:bsz, : start_pos + seqlen] 127 | 128 | xq = xq.transpose(1, 2) 129 | keys = keys.transpose(1, 2) 130 | values = values.transpose(1, 2) 131 | scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim) 132 | if mask is not None: 133 | scores = scores + mask # (bs, n_local_heads, slen, cache_len + slen) 134 | scores = F.softmax(scores.float(), dim=-1).type_as(xq) 135 | output = torch.matmul(scores, values) # (bs, n_local_heads, slen, head_dim) 136 | output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1) 137 | 138 | return self.wo(output) 139 | 140 | 141 | class FeedForward(nn.Module): 142 | def __init__( 143 | self, 144 | dim: int, 145 | hidden_dim: int, 146 | multiple_of: int, 147 | ): 148 | super().__init__() 149 | hidden_dim = int(2 * hidden_dim / 3) 150 | hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) 151 | 152 | self.w1 = nn.Linear(dim, hidden_dim, bias=False) 153 | self.w2 = nn.Linear(hidden_dim, dim, bias=False) 154 | self.w3 = nn.Linear(dim, hidden_dim, bias=False) 155 | 156 | def forward(self, x): 157 | return self.w2(F.silu(self.w1(x)) * self.w3(x)) 158 | 159 | 160 | class TransformerBlock(nn.Module): 161 | def __init__(self, layer_id: int, args: ModelArgs): 162 | super().__init__() 163 | self.n_heads = args.n_heads 164 | self.dim = args.dim 165 | self.head_dim = args.dim // args.n_heads 166 | self.attention = Attention(args) 167 | self.feed_forward = FeedForward( 168 | dim=args.dim, hidden_dim=4 * args.dim, multiple_of=args.multiple_of 169 | ) 170 | self.layer_id = layer_id 171 | self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps) 172 | self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps) 173 | 174 | def forward( 175 | self, 176 | x: torch.Tensor, 177 | start_pos: int, 178 | freqs_cis: torch.Tensor, 179 | mask: Optional[torch.Tensor], 180 | ): 181 | h = x + 
self.attention.forward( 182 | self.attention_norm(x), start_pos, freqs_cis, mask 183 | ) 184 | out = h + self.feed_forward.forward(self.ffn_norm(h)) 185 | return out 186 | 187 | 188 | class Transformer(nn.Module): 189 | def __init__(self, params: ModelArgs): 190 | super().__init__() 191 | self.params = params 192 | self.vocab_size = params.vocab_size 193 | self.n_layers = params.n_layers 194 | 195 | self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim) 196 | 197 | self.layers = torch.nn.ModuleList() 198 | for layer_id in range(params.n_layers): 199 | self.layers.append(TransformerBlock(layer_id, params)) 200 | 201 | self.norm = RMSNorm(params.dim, eps=params.norm_eps) 202 | self.output = nn.Linear(params.dim, params.vocab_size, bias=False) 203 | 204 | self.freqs_cis = precompute_freqs_cis( 205 | self.params.dim // self.params.n_heads, self.params.max_seq_len * 2 206 | ) 207 | 208 | @torch.inference_mode() 209 | def forward(self, tokens: torch.Tensor, start_pos: int): 210 | _bsz, seqlen = tokens.shape 211 | h = self.tok_embeddings(tokens) 212 | self.freqs_cis = self.freqs_cis.to(h.device) 213 | freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen] 214 | 215 | mask = None 216 | if seqlen > 1: 217 | mask = torch.full( 218 | (1, 1, seqlen, seqlen), float("-inf"), device=tokens.device 219 | ) 220 | mask = torch.triu(mask, diagonal=start_pos + 1).type_as(h) 221 | 222 | for layer in self.layers: 223 | h = layer(h, start_pos, freqs_cis, mask) 224 | h = self.norm(h) 225 | output = self.output(h[:, -1, :]) # only compute last logits 226 | return output.float() 227 | -------------------------------------------------------------------------------- /llama/tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the GNU General Public License version 3. 
3 | 4 | from sentencepiece import SentencePieceProcessor 5 | from logging import getLogger 6 | from typing import * 7 | import os 8 | 9 | from transformers.tokenization_utils import PreTrainedTokenizer 10 | 11 | logger = getLogger() 12 | 13 | 14 | class Tokenizer: 15 | def __init__(self, model_path: str): 16 | # reload tokenizer 17 | assert os.path.isfile(model_path), model_path 18 | self.sp_model = SentencePieceProcessor(model_file=model_path) 19 | #print(f"loaded SentencePiece model from {model_path}") 20 | 21 | # BOS / EOS token IDs 22 | self.n_words: int = self.sp_model.vocab_size() 23 | self.bos_id: int = self.sp_model.bos_id() 24 | self.eos_id: int = self.sp_model.eos_id() 25 | self.pad_id: int = self.sp_model.pad_id() 26 | #print(f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}") 27 | assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() 28 | 29 | def encode(self, s: str, bos: bool, eos: bool) -> List[int]: 30 | assert type(s) is str 31 | t = self.sp_model.encode(s) 32 | if bos: 33 | t = [self.bos_id] + t 34 | if eos: 35 | t = t + [self.eos_id] 36 | return t 37 | 38 | def decode(self, t: List[int]) -> str: 39 | return self.sp_model.decode(t) 40 | 41 | 42 | if __name__ == "__main__": 43 | def get_args(): 44 | import argparse 45 | 46 | parser = argparse.ArgumentParser() 47 | parser.add_argument( 48 | "--tokenizer_path", type=str, default="/llama_data/tokenizer.model" 49 | ) 50 | return parser.parse_args() 51 | 52 | t = Tokenizer(model_path=get_args().tokenizer_path) 53 | print(t.encode("hello world", False, False)) 54 | -------------------------------------------------------------------------------- /llama/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.8" 2 | -------------------------------------------------------------------------------- /quant_infer.py: -------------------------------------------------------------------------------- 1 | import hiq, time 2 | from hiq.memory import total_gpu_memory_mb, get_memory_mb 3 | import platform 4 | 5 | 6 | def main(): 7 | 8 | try: 9 | wsl = 'microsoft' in platform.uname()[2].lower() 10 | except: 11 | wsl = False 12 | 13 | driver = hiq.HiQLatency( 14 | hiq_table_or_path=[ 15 | ["llama.llama_infer", "", "run", "run_quant"], 16 | ["llama.llama_infer", "LLaMATokenizer", "from_pretrained", "from_pretrained"], 17 | ["llama.hf", "LLaMATokenizer", "encode", "encode"], 18 | ["llama.llama_infer", "", "load_quant", "load_quant"], 19 | ["llama.hf.modeling_llama", "LLaMAForCausalLM", "generate", "generate"] 20 | ], 21 | metric_funcs=[time.time, get_memory_mb] + ([total_gpu_memory_mb] if not wsl else []), # WSL does not contain nvidia-smi 22 | # extra_metrics={hiq.ExtraMetrics.ARGS}, 23 | ) 24 | 25 | args = hiq.mod("llama.llama_infer").get_args() 26 | hiq.mod("llama.llama_infer").run(args) 27 | print("*" * 30, ("GPU/" if not wsl else "") + "CPU/Latency Profiling", "*" * 30) 28 | if wsl: 29 | print('(WSL does not contain nvidia-smi, GPU profiling is disabled)') 30 | driver.show() 31 | 32 | 33 | if __name__ == "__main__": 34 | main() 35 | -------------------------------------------------------------------------------- /requirements-quant.txt: -------------------------------------------------------------------------------- 1 | transformers>=4.26.0 2 | gptq>=0.0.2 3 | sentencepiece>=0.1.97 4 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | torch>=1.12.0 2 | fairscale>=0.4.13 3 | fire~=0.5.0 4 | hiq-python>=1.1.9 5 | sentencepiece==0.1.97 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the GNU General Public License version 3. 3 | 4 | from setuptools import setup, find_packages 5 | import os 6 | 7 | here = os.path.dirname(os.path.realpath(__file__)) 8 | 9 | 10 | def _get_version(): 11 | with open(os.path.join(here, "llama", "version.py")) as f: 12 | try: 13 | version_line = next(line for line in f if line.startswith("__version__")) 14 | except StopIteration: 15 | raise ValueError("__version__ not defined in llama/version.py") 16 | else: 17 | ns = {} 18 | exec(version_line, ns) # pylint: disable=exec-used 19 | return ns["__version__"] 20 | 21 | 22 | VERSION = _get_version() 23 | 24 | 25 | def read_file(filename: str): 26 | try: 27 | lines = [] 28 | with open(filename) as file: 29 | lines = file.readlines() 30 | lines = [line.rstrip() for line in lines if not line.startswith("#")] 31 | return lines 32 | except OSError: 33 | return [] 34 | 35 | 36 | DESCRIPTION = "🦙 LLaMA: Open and Efficient Foundation Language Models on a Single GPU" 37 | 38 | r_quant = read_file(f"{here}/requirements-quant.txt") 39 | r_basic = read_file(f"{here}/requirements.txt") 40 | 41 | 42 | def package_files(ds): 43 | paths = [] 44 | for d in ds: 45 | for path, directories, filenames in os.walk(d): 46 | for filename in filenames: 47 | if "__pycache__" not in str(filename) and not filename.endswith('.pyc'): 48 | paths.append(str(os.path.join(path, filename))[len("llama/") :]) 49 | return paths 50 | 51 | 52 | extra_files = package_files(["llama/"]) 53 | 54 | setup( 55 | name="pyllama", 56 | version=VERSION, 57 | author="Juncong Moo;Meta AI", 58 | author_email="JuncongMoo@gmail.com", 59 | description=DESCRIPTION, 60 | long_description=open("README.md", "r", encoding="utf-8").read(), 61 | long_description_content_type="text/markdown", 62 | install_requires=r_basic, 63 | package_data={"llama": extra_files}, 64 | include_package_data=True, 65 | keywords=[ 66 | "LLaMA", 67 | ], 68 | classifiers=[ 69 | "Programming Language :: Python :: 3", 70 | "Programming Language :: Python :: 3.6", 71 | "Programming Language :: Python :: 3.7", 72 | "Programming Language :: Python :: 3.8", 73 | "Programming Language :: Python :: 3.9", 74 | "Programming Language :: Python :: 3.10", 75 | "Programming Language :: Python :: 3.11", 76 | ], 77 | url="https://github.com/juncongmoo/pyllama", 78 | packages=["llama"], 79 | extras_require={ 80 | "quant": r_quant, 81 | "full": r_quant + r_basic, 82 | }, 83 | ) 84 | --------------------------------------------------------------------------------
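
The rotary-embedding helpers in llama/model_single.py have a strict shape contract: precompute_freqs_cis(dim, end) returns a complex64 tensor of shape (end, dim // 2), and apply_rotary_emb expects the slice it receives to match (seqlen, head_dim // 2) via the assert in reshape_for_broadcast. The following is an illustrative, CPU-only check (not part of the repository) that exercises that contract with random tensors; all sizes are arbitrary example values.

import torch
from llama.model_single import precompute_freqs_cis, apply_rotary_emb

bsz, seqlen, n_heads, head_dim = 2, 16, 8, 64
xq = torch.randn(bsz, seqlen, n_heads, head_dim)
xk = torch.randn(bsz, seqlen, n_heads, head_dim)

# (2048, 32) complex64; slicing the first seqlen rows satisfies the shape
# assert inside reshape_for_broadcast.
freqs_cis = precompute_freqs_cis(head_dim, 2048)
xq_rot, xk_rot = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis[:seqlen])

assert xq_rot.shape == xq.shape and xk_rot.shape == xk.shape
# Multiplying by unit-modulus complex numbers rotates each (even, odd) feature
# pair without changing its magnitude, so per-head vector norms are preserved.
assert torch.allclose(xq.norm(dim=-1), xq_rot.norm(dim=-1), atol=1e-4)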
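
As a quick sanity check of the single-GPU model defined in llama/model_single.py, the sketch below builds a small Transformer with randomly initialized weights (its outputs are meaningless) purely to exercise the forward(tokens, start_pos) contract: the full prompt primes the KV cache at start_pos=0, then tokens are fed one at a time with an advancing start_pos. This is an illustrative example, not a repository entry point; it assumes a CUDA device (the cache is placed on the GPU by default via the KV_CAHCHE_IN_GPU flag read in Attention.__init__) and that the listed requirements are installed. The vocab_size of 32000 and the random prompt are stand-in values; real checkpoint and tokenizer loading is handled elsewhere, e.g. by llama.llama_infer.

import torch
from llama.model_single import ModelArgs, Transformer

assert torch.cuda.is_available(), "the KV cache defaults to GPU placement"

# Tiny model using the ModelArgs defaults (dim=512, n_layers=8, n_heads=8);
# only vocab_size has to be supplied because it defaults to -1.
args = ModelArgs(vocab_size=32000)
model = Transformer(args).cuda().eval()

# Stand-in prompt of 8 random token ids; max_batch_size defaults to 1.
prompt = torch.randint(0, args.vocab_size, (1, 8), device="cuda")

# Prime the cache with the whole prompt, then greedily pick the next token.
logits = model(prompt, start_pos=0)
next_token = torch.argmax(logits, dim=-1, keepdim=True)

# Incremental decoding: one token per step, advancing start_pos so the cached
# keys/values for earlier positions are reused instead of recomputed.
cur_pos = prompt.shape[1]
for _ in range(4):
    logits = model(next_token, start_pos=cur_pos)
    next_token = torch.argmax(logits, dim=-1, keepdim=True)
    cur_pos += 1
print(next_token)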