├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── MODEL_CARD.md ├── README.md ├── apps ├── flask │ ├── requirements.txt │ ├── web_server.py │ └── web_server_single.py └── gradio │ ├── run.sh │ ├── set_up_venv.sh │ ├── webapp.py │ └── webapp_single.py ├── dataset └── alpaca_data.json ├── docs ├── README.md ├── download.png ├── llama_hf.md ├── llama_inference.png ├── llama_multigpu.png ├── llama_profiling.png ├── llama_webui.png ├── pyllama_7B_3GB.png └── pyllama_7B_6GB.png ├── download.sh ├── example.py ├── inference.py ├── inference_driver.py ├── llama ├── __init__.py ├── convert_llama.py ├── download.py ├── download_community.sh ├── download_community_stop.sh ├── generation.py ├── hf │ ├── __init__.py │ ├── configuration_llama.py │ ├── modeling_llama.py │ ├── tokenization_llama.py │ └── utils.py ├── llama_infer.py ├── llama_multigpu.py ├── llama_quant.py ├── model_parallel.py ├── model_single.py ├── tokenizer.py └── version.py ├── quant_infer.py ├── requirements-quant.txt ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 
39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at . All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to LLaMA 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `main`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Meta's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 
28 | 29 | ## License 30 | By contributing to LLaMA, you agree that your contributions will be licensed 31 | under the LICENSE file in the root directory of this source tree. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements*.txt 2 | include README.md 3 | include llama/download_community.sh 4 | -------------------------------------------------------------------------------- /MODEL_CARD.md: -------------------------------------------------------------------------------- 1 | # LLama Model Card 2 | 3 | ## Model details 4 | **Organization developing the model** 5 | The FAIR team of Meta AI. 6 | 7 | **Model date** 8 | LLaMA was trained between December. 2022 and Feb. 2023. 9 | 10 | **Model version** 11 | This is version 1 of the model. 12 | 13 | **Model type** 14 | LLaMA is an auto-regressive language model, based on the transformer architecture. The model comes in different sizes: 7B, 13B, 33B and 65B parameters. 15 | 16 | **Paper or resources for more information** 17 | More information can be found in the paper “LLaMA, Open and Efficient Foundation Language Models”, available at https://research.facebook.com/publications/llama-open-and-efficient-foundation-language-models/. 18 | 19 | **Citations details** 20 | https://research.facebook.com/publications/llama-open-and-efficient-foundation-language-models/ 21 | 22 | **License** 23 | Non-commercial bespoke license 24 | 25 | **Where to send questions or comments about the model** 26 | Questions and comments about LLaMA can be sent via the [GitHub repository](https://github.com/facebookresearch/llama) of the project , by opening an issue. 27 | 28 | ## Intended use 29 | **Primary intended uses** 30 | The primary use of LLaMA is research on large language models, including: 31 | exploring potential applications such as question answering, natural language understanding or reading comprehension, 32 | understanding capabilities and limitations of current language models, and developing techniques to improve those, 33 | evaluating and mitigating biases, risks, toxic and harmful content generations, hallucinations. 34 | 35 | **Primary intended users** 36 | The primary intended users of the model are researchers in natural language processing, machine learning and artificial intelligence. 37 | 38 | **Out-of-scope use cases** 39 | LLaMA is a base, or foundational, model. As such, it should not be used on downstream applications without further risk evaluation and mitigation. In particular, our model has not been trained with human feedback, and can thus generate toxic or offensive content, incorrect information or generally unhelpful answers. 40 | 41 | ## Factors 42 | **Relevant factors** 43 | One of the most relevant factors for which model performance may vary is which language is used. Although we included 20 languages in the training data, most of our dataset is made of English text, and we thus expect the model to perform better for English than other languages. Relatedly, it has been shown in previous studies that performance might vary for different dialects, and we expect that it will be the case for our model. 44 | 45 | **Evaluation factors** 46 | As our model is trained on data from the Web, we expect that it reflects biases from this source. 
We thus evaluated on RAI datasets to measure biases exhibited by the model for gender, religion, race, sexual orientation, age, nationality, disability, physical appearance and socio-economic status. We also measure the toxicity of model generations, depending on the toxicity of the context used to prompt the model. 47 | 48 | ## Metrics 49 | **Model performance measures** 50 | We use the following measure to evaluate the model: 51 | - Accuracy for common sense reasoning, reading comprehension, natural language understanding (MMLU), BIG-bench hard, WinoGender and CrowS-Pairs, 52 | - Exact match for question answering, 53 | - The toxicity score from Perspective API on RealToxicityPrompts. 54 | 55 | **Decision thresholds** 56 | Not applicable. 57 | 58 | **Approaches to uncertainty and variability** 59 | Due to the high computational requirements of training LLMs, we trained only one model of each size, and thus could not evaluate variability of pre-training. 60 | 61 | ## Evaluation datasets 62 | The model was evaluated on the following benchmarks: BoolQ, PIQA, SIQA, HellaSwag, WinoGrande, ARC, OpenBookQA, NaturalQuestions, TriviaQA, RACE, MMLU, BIG-bench hard, GSM8k, RealToxicityPrompts, WinoGender, CrowS-Pairs. 63 | 64 | ## Training dataset 65 | The model was trained using the following source of data: CCNet [67%], C4 [15%], GitHub [4.5%], Wikipedia [4.5%], Books [4.5%], ArXiv [2.5%], Stack Exchange[2%]. The Wikipedia and Books domains include data in the following languages: bg, ca, cs, da, de, en, es, fr, hr, hu, it, nl, pl, pt, ro, ru, sl, sr, sv, uk. See the paper for more details about the training set and corresponding preprocessing. 66 | 67 | ## Quantitative analysis 68 | Hyperparameters for the model architecture 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 84 | 85 | 87 | 88 | 90 | 91 | 93 | 94 |
*LLaMa Model hyper parameters*

| Number of parameters | dimension | n heads | n layers | Learn rate | Batch size | n tokens |
| -------------------- | --------- | ------- | -------- | ---------- | ---------- | -------- |
| 7B  | 4096 | 32 | 32 | 3.0E-04 | 4M | 1T   |
| 13B | 5120 | 40 | 40 | 3.0E-04 | 4M | 1T   |
| 33B | 6656 | 52 | 60 | 1.5E-04 | 4M | 1.4T |
| 65B | 8192 | 64 | 80 | 1.5E-04 | 4M | 1.4T |

*Table 1 - Summary of LLama Model Hyperparameters*

We present our results on eight standard common sense reasoning benchmarks in the table below.

*LLaMa Reasoning tasks*

| Number of parameters | BoolQ | PIQA | SIQA | HellaSwag | WinoGrande | ARC-e | ARC-c | OBQA | COPA |
| -------------------- | ----- | ---- | ---- | --------- | ---------- | ----- | ----- | ---- | ---- |
| 7B  | 76.5 | 79.8 | 48.9 | 76.1 | 70.1 | 76.7 | 47.6 | 57.2 | 93 |
| 13B | 78.1 | 80.1 | 50.4 | 79.2 | 73   | 78.1 | 52.7 | 56.4 | 94 |
| 33B | 83.1 | 82.3 | 50.4 | 82.8 | 76   | 81.4 | 57.8 | 58.6 | 92 |
| 65B | 85.3 | 82.8 | 52.3 | 84.2 | 77   | 81.5 | 56   | 60.2 | 94 |
120 | 121 | *Table 2 - Summary of LLama Model Performance on Reasoning tasks* 122 | 123 | 124 | We present our results on bias in the table below. Note that lower value is better indicating lower bias. 125 | 126 | 127 | | No | Category | FAIR LLM | 128 | | --- | -------------------- | -------- | 129 | | 1 | Gender | 70.6 | 130 | | 2 | Religion | 79 | 131 | | 3 | Race/Color | 57 | 132 | | 4 | Sexual orientation | 81 | 133 | | 5 | Age | 70.1 | 134 | | 6 | Nationality | 64.2 | 135 | | 7 | Disability | 66.7 | 136 | | 8 | Physical appearance | 77.8 | 137 | | 9 | Socioeconomic status | 71.5 | 138 | | | LLaMA Average | 66.6 | 139 | 140 | *Table 3 - Summary bias of our model output* 141 | 142 | 143 | 144 | ## Ethical considerations 145 | **Data** 146 | The data used to train the model is collected from various sources, mostly from the Web. As such, it contains offensive, harmful and biased content. We thus expect the model to exhibit such biases from the training data. 147 | 148 | **Human life** 149 | The model is not intended to inform decisions about matters central to human life, and should not be used in such a way. 150 | 151 | **Mitigations** 152 | We filtered the data from the Web based on its proximity to Wikipedia text and references. For this, we used a Kneser-Ney language model and a fastText linear classifier. 153 | 154 | **Risks and harms** 155 | Risks and harms of large language models include the generation of harmful, offensive or biased content. These models are often prone to generating incorrect information, sometimes referred to as hallucinations. We do not expect our model to be an exception in this regard. 156 | 157 | **Use cases** 158 | LLaMA is a foundational model, and as such, it should not be used for downstream applications without further investigation and mitigations of risks. These risks and potential fraught use cases include, but are not limited to: generation of misinformation and generation of harmful, biased or offensive content. 159 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🦙 LLaMA - Run LLM in A Single 4GB GPU 2 | 3 | 4 | > 📢 `pyllama` is a hacked version of `LLaMA` based on original Facebook's implementation but more convenient to run in a Single consumer grade GPU. 5 | 6 | > The Hugging Face's LLaMA implementation is available at `pyllama.hf`. 7 | 8 | ## 📥 Installation 9 | 10 | In a conda env with pytorch / cuda available, run: 11 | ``` 12 | pip install pyllama -U 13 | ``` 14 | 15 | > 🐏 If you have installed llama library from other sources, please uninstall the previous llama library and use `pip install pyllama -U` to install the latest version. 16 | 17 | 18 | ## 📦 Download Model Files 19 | 20 | ### 🧘‍♀️ Official Way 21 | 22 | In order to download the checkpoints and tokenizer, fill this [google form](https://forms.gle/jk851eBVbX1m5TAv5) 23 | 24 | Once your request is approved, you will receive links to download the tokenizer and model files. 25 | Edit the `download.sh` script with the signed url provided in the email to download the model weights and tokenizer. 26 | 27 | ### 🐒 Community Way 28 | 29 | - 1. pyllama 30 | 31 | There is another high-speed way to download the checkpoints and tokenizers. There are four models(7B,13B,30B,65B) available. 
To download all of them, run: 32 | 33 | ```bash 34 | python -m llama.download 35 | ``` 36 | 37 | To download only the 7B model files to your current directory, run: 38 | 39 | ```bash 40 | python -m llama.download --model_size 7B 41 | ``` 42 | 43 | To download only the 7B and 30B model files to folder `/tmp/pyllama_data`, run: 44 | 45 | ```bash 46 | python -m llama.download --model_size 7B,30B --folder /tmp/pyllama_data 47 | ``` 48 | 49 | The help doc is: 50 | ```bash 51 | $python -m llama.download --help 52 | usage: download.py [-h] [--model_size MODEL_SIZE] [--folder FOLDER] 53 | 54 | optional arguments: 55 | -h, --help show this help message and exit 56 | --model_size MODEL_SIZE 57 | The size of the models that you want to download. A comma separated 58 | string of any of "7B", "13B", "30B", "65B". Totally 219G disk space 59 | is needed to download them all. If you only want to download the 7B 60 | model, just put "7B" here. 61 | --folder FOLDER The target folder for the download files 62 | ``` 63 | 64 | - Sample Screenshot 65 | 66 | ![](docs/download.png) 67 | 68 | - 2. Bittorrent 69 | 70 | 🔥 In order to download the checkpoints and tokenizer, use this BitTorrent link: "[magnet:?xt=urn:btih:ZXXDAUWYLRUXXBHUYEMS6Q5CE5WA3LVA&dn=LLaMA](magnet:?xt=urn:btih:ZXXDAUWYLRUXXBHUYEMS6Q5CE5WA3LVA&dn=LLaMA)". 71 | 72 | 73 | ## 💎 Quantize LLaMA to run in a 4GB GPU 74 | 75 | `pyllama` support quantization of 2/3/4/8-bit so that you can run model in a 4G memory GPU. 76 | 77 | > You need to run `export HUGGING_FACE_HUB_TOKEN=XXX` to be able to access Hugging Face's data. You also need to install [gptq](https://pypi.org/project/gptq/) with command `pip install gptq`. 78 | 79 | ```bash 80 | python -m llama.llama_quant --help 81 | usage: llama_quant.py [-h] [--ckpt_dir CKPT_DIR] [--tokenizer_path TOKENIZER_PATH] 82 | [--seed SEED] [--nsamples NSAMPLES] [--percdamp PERCDAMP] 83 | [--nearest] [--wbits {2,3,4,8,16}] [--groupsize GROUPSIZE] 84 | [--save SAVE] [--load LOAD] [--benchmark BENCHMARK] [--check] 85 | [--cuda CUDA] [--eval] 86 | {wikitext2,ptb,c4} 87 | 88 | positional arguments: 89 | {wikitext2,ptb,c4} Where to extract calibration data from. 90 | 91 | optional arguments: 92 | -h, --help show this help message and exit 93 | --ckpt_dir CKPT_DIR 94 | --tokenizer_path TOKENIZER_PATH 95 | --seed SEED Seed for sampling the calibration data. 96 | --nsamples NSAMPLES Number of calibration data samples. 97 | --percdamp PERCDAMP Percent of the average Hessian diagonal to use for dampening. 98 | --nearest Whether to run the RTN baseline. 99 | --wbits {2,3,4,8} bits for quantization 100 | --groupsize GROUPSIZE 101 | Groupsize to use for quantization; default uses full row. 102 | --save SAVE Save quantized checkpoint under this name, eg pyllama-7B4b.pt. 103 | --load LOAD Load quantized model. 104 | --benchmark BENCHMARK 105 | Number of tokens to use for benchmarking. 106 | --check Whether to compute perplexity during benchmarking for verification. 107 | --cuda CUDA GPU device string, 'cuda:0' by default. 
108 | --eval Evaluate the model with dataset wikitext2, ptb and c4 109 | ``` 110 | 111 | - Quantize the 7B model to 8-bit 112 | 113 | ```bash 114 | python -m llama.llama_quant decapoda-research/llama-7b-hf c4 --wbits 8 --save pyllama-7B8b.pt 115 | ``` 116 | 117 | - Quantize the 7B model to 4-bit with groupsize 128 (the recommended setup 🔥) 118 | 119 | ```bash 120 | python -m llama.llama_quant decapoda-research/llama-7b-hf c4 --wbits 4 --groupsize 128 --save pyllama-7B4b.pt 121 | ``` 122 | 123 | - Quantize the 7B model to 2-bit 124 | 125 | ```bash 126 | python -m llama.llama_quant decapoda-research/llama-7b-hf c4 --wbits 2 --save pyllama-7B2b.pt 127 | ``` 128 | 129 | The download links for quantized LLaMA files are below: 130 | 131 | - 7B 132 | 133 | | Quant Type | Size | Link | MD5 | Loss | Password | 134 | |----------|:-------------:|------:|------:|------:|--:| 135 | | 2-bit | 2160484475 | [🔗](https://pan.baidu.com/s/1zOdKOHnSCsz6TFix2NTFtg) | 4c7215d28c1f650218c43fc46402cec5 | - | 8g9d | 136 | | 3-bit | - | - | - | - | - | 137 | | 4-bit | 3779485819 | - | cce9a3b522ddf5c011ee0174b2ff3dfb | - | - | 138 | | 8-bit | 7017493231 | - | 2648b09597cf8f9e0d1a04cb70b71cab | - | - | 139 | 140 | 141 | It took me 2 hours and 40 minutes to quantize the 65B model to 4-bit. The file size was reduced from 122GB to 32GB. 142 | 143 | > The following suggestions are recommended for LLM quantization: 144 | > 1. By default, use 4-bit quantization for LLM inference, as it offers a good trade-off between total model bits and zero-shot accuracy. 145 | > 2. Use a block size of 128 or lower to stabilize 4-bit quantization and improve zero-shot performance. 146 | > 3. Use a floating point or quantile quantization data type. In some cases, integer data types might be preferable to improve inference latency, depending on the implementation and hardware support. 147 | 148 | ## 🔮 Single GPU Inference 149 | 150 | ### 🥥 Without Quantization 151 | 152 | Set the environment variables `CKPT_DIR` to your LLaMA model folder, for example `/llama_data/7B`, and `TOKENIZER_PATH` to your tokenizer's path, such as `/llama_data/tokenizer.model`. 153 | 154 | Then run the following command: 155 | 156 | ```bash 157 | python inference.py --ckpt_dir $CKPT_DIR --tokenizer_path $TOKENIZER_PATH 158 | ``` 159 | 160 | The following is an example of LLaMA running on a single 8GB GPU. 161 | 162 | ![LLaMA Inference](https://raw.githubusercontent.com/juncongmoo/pyllama/main/docs/llama_inference.png) 163 | 164 | ### 🥝 With Quantization 165 | 166 | With quantization, you can run LLaMA on a GPU with 4GB of memory. 167 | 168 | - pyllama can run the 7B model with 6GB of GPU memory. 169 | Example: ```python quant_infer.py --wbits 4 --load pyllama-7B4b.pt --text "..." --max_length 24 --cuda cuda:0``` 170 | 171 | ![4bit-quant-6GB](https://github.com/juncongmoo/pyllama/blob/main/docs/pyllama_7B_6GB.png) 172 | 173 | - pyllama can run the 7B model with 3.2GB of GPU memory. 174 | Example: ```python quant_infer.py --wbits 2 --load pyllama-7B4b.pt --text "..." --max_length 32``` 175 | 176 | ![2bit-quant-3GB](https://github.com/juncongmoo/pyllama/blob/main/docs/pyllama_7B_3GB.png) 177 | 178 | ### 💡 Tips 179 | 180 | - To keep the KV cache in CPU memory, run `export KV_CAHCHE_IN_GPU=0` in the shell.
181 | 182 | - To profile CPU/GPU/Latency, run: 183 | 184 | ```bash 185 | python inference_driver.py --ckpt_dir $CKPT_DIR --tokenizer_path $TOKENIZER_PATH 186 | ``` 187 | 188 | A sample result is like: 189 | 190 | ![LLaMA Inference](https://raw.githubusercontent.com/juncongmoo/pyllama/main/docs/llama_profiling.png) 191 | 192 | - Tune `max_seq_len` and `max_batch_size` to reduce memory consumption to be able to run in GPU. Refer to: [this post](https://github.com/juncongmoo/pyllama/issues/9)! 193 | 194 | ### 🍉 Start a gradio webui 195 | 196 | 197 | ```bash 198 | $ cd apps/gradio 199 | $ python webapp_single.py --ckpt_dir $CKPT_DIR --tokenizer_path $TOKENIZER_PATH 200 | ``` 201 | 202 | You should see something like this in your browser: 203 | 204 | ![LLaMA Inference](https://raw.githubusercontent.com/juncongmoo/pyllama/main/docs/llama_webui.png) 205 | 206 | ### 🍓 Start a web server 207 | 208 | The following command will start a flask web server: 209 | 210 | ```bash 211 | $ cd apps/flask 212 | $ python web_server_single.py --ckpt_dir $CKPT_DIR --tokenizer_path $TOKENIZER_PATH 213 | ``` 214 | 215 | ## 🍒 Multiple GPU Inference 216 | 217 | ### 🧘‍♀️ Official Way 218 | 219 | To use the original META's model parallel, please set environment variable `PYLLAMA_META_MP` like: 220 | 221 | ``` 222 | export PYLLAMA_META_MP=1 223 | ``` 224 | 225 | With this environment variable set, you can `import llama` and the original META version's llama will be imported. 226 | 227 | The provided `example.py` can be run on a single or multi-gpu node with `torchrun` and will output completions for two pre-defined prompts. Using `TARGET_FOLDER` as defined in `download.sh`: 228 | 229 | ```bash 230 | torchrun --nproc_per_node MP example.py --ckpt_dir $TARGET_FOLDER/model_size \ 231 | --tokenizer_path $TARGET_FOLDER/tokenizer.model 232 | ``` 233 | 234 | Different models require different MP values: 235 | 236 | | Model | MP | 237 | |--------|----| 238 | | 7B | 1 | 239 | | 13B | 2 | 240 | | 30B | 4 | 241 | | 65B | 8 | 242 | 243 | ### 🐒 Community Way 244 | 245 | There are two steps to run LLaMA in multi-GPU environment. 
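As a rough end-to-end sketch (the `/llama_data/...` paths and the idea that step 2 consumes the directory written by step 1 are assumptions, not taken from this repo), the two steps described below chain together like this:

```bash
# Step 1 (sketch): convert the original checkpoint into a Hugging Face style checkpoint.
# The flags mirror the convert_llama help text below; the paths are placeholders.
python -m llama.convert_llama --ckpt_dir /llama_data/7B \
    --tokenizer_path /llama_data/tokenizer.model \
    --model_size 7B --output_dir /llama_data/7B_hf --to hf

# Step 2 (sketch): run the converted weights across the visible GPUs with HF accelerate.
# (Assumption: llama_multigpu reads the directory written by step 1.)
python -m llama.llama_multigpu --state_dict_dir /llama_data/7B_hf --model_size 7B
```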
246 | 247 | - Convert original LLaMA model 248 | 249 | ```bash 250 | $python -m llama.convert_llama --help 251 | usage: convert_llama.py [-h] [--ckpt_dir CKPT_DIR] [--tokenizer_path TOKENIZER_PATH] 252 | [--model_size {7B,13B,30B,65B}] [--output_dir OUTPUT_DIR] 253 | [--max_batch_size MAX_BATCH_SIZE] [--to {hf,fb}] 254 | 255 | optional arguments: 256 | -h, --help show this help message and exit 257 | --ckpt_dir CKPT_DIR 258 | --tokenizer_path TOKENIZER_PATH 259 | --model_size {7B,13B,30B,65B} 260 | --output_dir OUTPUT_DIR 261 | Location to write HF model and tokenizer 262 | --max_batch_size MAX_BATCH_SIZE 263 | --to {hf,fb} 264 | ``` 265 | 266 | - Run with HF's accelerate with multiple GPUs 267 | 268 | ```bash 269 | $python -m llama.llama_multigpu --help 270 | usage: llama_multigpu.py [-h] [--state_dict_dir STATE_DICT_DIR] [--model_size {7B,13B,30B,65B}] 271 | 272 | optional arguments: 273 | -h, --help show this help message and exit 274 | --state_dict_dir STATE_DICT_DIR 275 | --model_size {7B,13B,30B,65B} 276 | ``` 277 | 278 | ![](https://github.com/juncongmoo/pyllama/blob/main/docs/llama_multigpu.png) 279 | 280 | ## 🔬 Model Fine Tuning 281 | 282 | ### With [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca) Instruction-Following Dataset 283 | 284 | - Tokenization 285 | - Finetuning 286 | - Efficient FT 287 | 288 | ## 🧬 LLaMA model structure 289 | 290 | - Meta 291 | - Hugging Face 292 | 293 | ``` 294 | https://github.com/facebookresearch/llama/blob/main/llama/model.py#LL127C27-L127C27 295 | ``` 296 | 297 | ### Model Card 298 | 299 | See [MODEL_CARD.md](https://github.com/juncongmoo/pyllama/blob/main/MODEL_CARD.md) 300 | 301 | ### License 302 | 303 | See the [LICENSE](https://github.com/juncongmoo/pyllama/blob/main/LICENSE) file. 304 | -------------------------------------------------------------------------------- /apps/flask/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi 2 | uvicorn 3 | 4 | -------------------------------------------------------------------------------- /apps/flask/web_server.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the GNU General Public License version 3. 
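# Usage sketch (assumption, not part of the original file): launch with torchrun so that
# LOCAL_RANK and WORLD_SIZE are set, e.g.
#   torchrun --nproc_per_node <MP> web_server.py --ckpt_dir <dir> --tokenizer_path <tokenizer.model>
# Rank 0 then serves POST /llama/ on port 8042 (see uvicorn.run below) with a JSON body like
#   {"prompts": ["Hello"], "max_gen_len": 64, "temperature": 0.8, "top_p": 0.95}
# and returns {"responses": [...]}.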
3 | 4 | from typing import Tuple 5 | import os 6 | import sys 7 | import argparse 8 | import torch 9 | import time 10 | import json 11 | 12 | from pathlib import Path 13 | from typing import List 14 | 15 | from pydantic import BaseModel 16 | from fastapi import FastAPI 17 | import uvicorn 18 | import torch.distributed as dist 19 | 20 | from fairscale.nn.model_parallel.initialize import initialize_model_parallel 21 | 22 | from llama import ModelArgs, Transformer, Tokenizer, LLaMA 23 | 24 | 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("--ckpt_dir", type=str, required=True) 27 | parser.add_argument("--tokenizer_path", type=str, required=True) 28 | parser.add_argument("--max_seq_len", type=int, default=512) 29 | parser.add_argument("--max_batch_size", type=int, default=1) 30 | 31 | 32 | app = FastAPI() 33 | 34 | 35 | def setup_model_parallel() -> Tuple[int, int]: 36 | local_rank = int(os.environ.get("LOCAL_RANK", -1)) 37 | world_size = int(os.environ.get("WORLD_SIZE", -1)) 38 | 39 | dist.init_process_group("nccl") 40 | initialize_model_parallel(world_size) 41 | torch.cuda.set_device(local_rank) 42 | 43 | # seed must be the same in all processes 44 | torch.manual_seed(1) 45 | return local_rank, world_size 46 | 47 | 48 | def load( 49 | ckpt_dir: str, 50 | tokenizer_path: str, 51 | local_rank: int, 52 | world_size: int, 53 | max_seq_len: int, 54 | max_batch_size: int, 55 | ) -> LLaMA: 56 | start_time = time.time() 57 | checkpoints = sorted(Path(ckpt_dir).glob("*.pth")) 58 | assert world_size == len( 59 | checkpoints 60 | ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}" 61 | ckpt_path = checkpoints[local_rank] 62 | print("Loading") 63 | checkpoint = torch.load(ckpt_path, map_location="cpu") 64 | with open(Path(ckpt_dir) / "params.json", "r") as f: 65 | params = json.loads(f.read()) 66 | 67 | model_args: ModelArgs = ModelArgs( 68 | max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params 69 | ) 70 | tokenizer = Tokenizer(model_path=tokenizer_path) 71 | model_args.vocab_size = tokenizer.n_words 72 | torch.set_default_tensor_type(torch.cuda.HalfTensor) 73 | model = Transformer(model_args) 74 | torch.set_default_tensor_type(torch.FloatTensor) 75 | model.load_state_dict(checkpoint, strict=False) 76 | 77 | generator = LLaMA(model, tokenizer) 78 | print(f"Loaded in {time.time() - start_time:.2f} seconds") 79 | return generator 80 | 81 | 82 | def init_generator( 83 | ckpt_dir: str, 84 | tokenizer_path: str, 85 | max_seq_len: int = 512, 86 | max_batch_size: int = 32, 87 | ): 88 | local_rank, world_size = setup_model_parallel() 89 | if local_rank > 0: 90 | sys.stdout = open(os.devnull, "w") 91 | 92 | generator = load( 93 | ckpt_dir, tokenizer_path, local_rank, world_size, max_seq_len, max_batch_size 94 | ) 95 | 96 | return generator 97 | 98 | 99 | if __name__ == "__main__": 100 | args = parser.parse_args() 101 | generator = init_generator( 102 | args.ckpt_dir, 103 | args.tokenizer_path, 104 | args.max_seq_len, 105 | args.max_batch_size, 106 | ) 107 | 108 | class Config(BaseModel): 109 | prompts: List[str] 110 | max_gen_len: int 111 | temperature: float = 0.8 112 | top_p: float = 0.95 113 | 114 | if dist.get_rank() == 0: 115 | 116 | @app.post("/llama/") 117 | def generate(config: Config): 118 | if len(config.prompts) > args.max_batch_size: 119 | return {"error": "too much prompts."} 120 | for prompt in config.prompts: 121 | if len(prompt) + config.max_gen_len > args.max_seq_len: 122 | return {"error": "max_gen_len too large."} 123 | 
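        # Rank 0 broadcasts the request parameters here; the non-zero ranks, spinning in the
        # else-branch below, receive them via broadcast_object_list and call generate() with
        # the same arguments, keeping all model-parallel shards in sync.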
dist.broadcast_object_list( 124 | [config.prompts, config.max_gen_len, config.temperature, config.top_p] 125 | ) 126 | 127 | results = generator.generate( 128 | config.prompts, 129 | max_gen_len=config.max_gen_len, 130 | temperature=config.temperature, 131 | top_p=config.top_p, 132 | ) 133 | 134 | return {"responses": results} 135 | 136 | uvicorn.run(app, host="0.0.0.0", port=8042) 137 | else: 138 | while True: 139 | config = [None] * 4 140 | try: 141 | dist.broadcast_object_list(config) 142 | generator.generate( 143 | config[0], 144 | max_gen_len=config[1], 145 | temperature=config[2], 146 | top_p=config[3], 147 | ) 148 | except: 149 | pass 150 | -------------------------------------------------------------------------------- /apps/flask/web_server_single.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import json 3 | 4 | from pathlib import Path 5 | from typing import List 6 | 7 | from pydantic import BaseModel 8 | from fastapi import FastAPI 9 | import uvicorn 10 | import torch.distributed as dist 11 | 12 | from llama import ModelArgs, Transformer, Tokenizer, LLaMA 13 | 14 | 15 | def get_args(): 16 | import argparse 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("--ckpt_dir", type=str, default="/llama_data/7B") 20 | parser.add_argument( 21 | "--tokenizer_path", type=str, default="/llama_data/tokenizer.model" 22 | ) 23 | parser.add_argument("--max_seq_len", type=int, default=512) 24 | parser.add_argument("--max_batch_size", type=int, default=1) 25 | return parser.parse_args() 26 | 27 | 28 | app = FastAPI() 29 | 30 | 31 | def load( 32 | ckpt_dir: str, 33 | tokenizer_path: str, 34 | local_rank: int, 35 | world_size: int, 36 | max_seq_len: int, 37 | max_batch_size: int, 38 | ) -> LLaMA: 39 | checkpoints = sorted(Path(ckpt_dir).glob("*.pth")) 40 | assert world_size == len( 41 | checkpoints 42 | ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}" 43 | ckpt_path = checkpoints[local_rank] 44 | 45 | checkpoint = torch.load(ckpt_path, map_location="cpu") 46 | 47 | with open(Path(ckpt_dir) / "params.json", "r") as f: 48 | params = json.loads(f.read()) 49 | 50 | model_args: ModelArgs = ModelArgs( 51 | max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params 52 | ) 53 | tokenizer = Tokenizer(model_path=tokenizer_path) 54 | model_args.vocab_size = tokenizer.n_words 55 | torch.set_default_tensor_type(torch.cuda.HalfTensor) 56 | model = Transformer(model_args) 57 | torch.set_default_tensor_type(torch.FloatTensor) 58 | model.load_state_dict(checkpoint, strict=False) 59 | generator = LLaMA(model, tokenizer) 60 | return generator 61 | 62 | 63 | def init_generator( 64 | ckpt_dir: str, 65 | tokenizer_path: str, 66 | max_seq_len: int = 512, 67 | max_batch_size: int = 1, 68 | ): 69 | local_rank, world_size = 0, 1 70 | generator = load( 71 | ckpt_dir, tokenizer_path, local_rank, world_size, max_seq_len, max_batch_size 72 | ) 73 | 74 | return generator 75 | 76 | 77 | if __name__ == "__main__": 78 | args = get_args() 79 | generator = init_generator( 80 | args.ckpt_dir, 81 | args.tokenizer_path, 82 | args.max_seq_len, 83 | args.max_batch_size, 84 | ) 85 | 86 | class Config(BaseModel): 87 | prompts: List[str] 88 | max_gen_len: int 89 | temperature: float = 0.8 90 | top_p: float = 0.95 91 | 92 | @app.post("/llama/") 93 | def generate(config: Config): 94 | if len(config.prompts) > args.max_batch_size: 95 | return {"error": "too much prompts."} 96 | for prompt in config.prompts: 97 | if len(prompt) 
+ config.max_gen_len > args.max_seq_len: 98 | return {"error": "max_gen_len too large."} 99 | results = generator.generate( 100 | config.prompts, 101 | max_gen_len=config.max_gen_len, 102 | temperature=config.temperature, 103 | top_p=config.top_p, 104 | ) 105 | return {"responses": results} 106 | 107 | uvicorn.run(app, host="0.0.0.0", port=8080) 108 | -------------------------------------------------------------------------------- /apps/gradio/run.sh: -------------------------------------------------------------------------------- 1 | # 2 | # first build the virtualenv using the virtualenv.sh script 3 | # 4 | # gradio webapp.py 5 | torchrun --nproc_per_node $MP webapp.py --ckpt_dir $CKPT_DIR --tokenizer_path $TOKENIZER_PATH 6 | # 7 | # or use CUDA_VISIBLE_DEVICES if you want to target a specific gpu device 8 | # CUDA_VISIBLE_DEVICES=1 torchrun --nproc_per_node $MP webapp.py 9 | # 10 | -------------------------------------------------------------------------------- /apps/gradio/set_up_venv.sh: -------------------------------------------------------------------------------- 1 | rm -rf llama_env 2 | python3 -m venv llama_env 3 | source llama_env/bin/activate 4 | 5 | pip uninstall llama -U 6 | pip install pyllama -U 7 | pip install gradio 8 | 9 | -------------------------------------------------------------------------------- /apps/gradio/webapp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import fire 5 | import time 6 | import json 7 | 8 | import gradio as gr 9 | 10 | from typing import Tuple 11 | from pathlib import Path 12 | from fairscale.nn.model_parallel.initialize import initialize_model_parallel 13 | from llama import ModelArgs, Transformer, Tokenizer, LLaMA 14 | 15 | 16 | def setup_model_parallel() -> Tuple[int, int]: 17 | local_rank = int(os.environ.get("LOCAL_RANK", -1)) 18 | world_size = int(os.environ.get("WORLD_SIZE", -1)) 19 | 20 | torch.distributed.init_process_group("nccl") 21 | initialize_model_parallel(world_size) 22 | torch.cuda.set_device(local_rank) 23 | 24 | # seed must be the same in all processes 25 | torch.manual_seed(1) 26 | return local_rank, world_size 27 | 28 | 29 | def load( 30 | ckpt_dir: str, 31 | tokenizer_path: str, 32 | local_rank: int, 33 | world_size: int, 34 | max_seq_len: int, 35 | max_batch_size: int, 36 | ) -> LLaMA: 37 | checkpoints = sorted(Path(ckpt_dir).glob("*.pth")) 38 | assert world_size == len( 39 | checkpoints 40 | ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}" 41 | ckpt_path = checkpoints[local_rank] 42 | 43 | checkpoint = torch.load(ckpt_path, map_location="cpu") 44 | 45 | with open(Path(ckpt_dir) / "params.json", "r") as f: 46 | params = json.loads(f.read()) 47 | 48 | model_args: ModelArgs = ModelArgs( 49 | max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params 50 | ) 51 | tokenizer = Tokenizer(model_path=tokenizer_path) 52 | model_args.vocab_size = tokenizer.n_words 53 | torch.set_default_tensor_type(torch.cuda.HalfTensor) 54 | model = Transformer(model_args) 55 | torch.set_default_tensor_type(torch.FloatTensor) 56 | model.load_state_dict(checkpoint, strict=False) 57 | generator = LLaMA(model, tokenizer) 58 | return generator 59 | 60 | 61 | def process(prompt: str): 62 | print("Received:\n", prompt) 63 | prompts = [prompt] 64 | results = generator.generate( 65 | prompts, max_gen_len=256, temperature=temperature, top_p=top_p 66 | ) 67 | print("Generated:\n", results[0]) 68 | return str(results[0]) 69 | 70 
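# process() relies on the module-level names `generator`, `temperature`, and `top_p`,
# which are assigned in the __main__ block below, so it is only valid once the model
# has been loaded there.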
| 71 | def get_args(): 72 | import argparse 73 | 74 | parser = argparse.ArgumentParser() 75 | parser.add_argument("--ckpt_dir", type=str, default="/llama_data/7B") 76 | parser.add_argument( 77 | "--tokenizer_path", type=str, default="/llama_data/tokenizer.model" 78 | ) 79 | return parser.parse_args() 80 | 81 | 82 | if __name__ == "__main__": 83 | args = get_args() 84 | ckpt_dir = args.ckpt_dir 85 | tokenizer_path = args.tokenizer_path 86 | temperature = 0.8 87 | top_p = 0.95 88 | max_seq_len = 512 89 | max_batch_size = 32 90 | 91 | local_rank, world_size = setup_model_parallel() 92 | if local_rank > 0: 93 | sys.stdout = open(os.devnull, "w") 94 | 95 | generator = load( 96 | ckpt_dir, tokenizer_path, local_rank, world_size, max_seq_len, max_batch_size 97 | ) 98 | 99 | demo = gr.Interface( 100 | fn=process, 101 | inputs=gr.Textbox(lines=10, placeholder="Your prompt here..."), 102 | outputs="text", 103 | ) 104 | 105 | # To create a public link, set `share=True` in `launch()`. 106 | demo.launch(share=True) 107 | -------------------------------------------------------------------------------- /apps/gradio/webapp_single.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import fire 5 | import time 6 | import json 7 | 8 | import gradio as gr 9 | 10 | from typing import Tuple 11 | from pathlib import Path 12 | from fairscale.nn.model_parallel.initialize import initialize_model_parallel 13 | from llama import ModelArgs, Transformer, Tokenizer, LLaMA 14 | 15 | 16 | def load( 17 | ckpt_dir: str, 18 | tokenizer_path: str, 19 | local_rank: int, 20 | world_size: int, 21 | max_seq_len: int, 22 | max_batch_size: int, 23 | ) -> LLaMA: 24 | checkpoints = sorted(Path(ckpt_dir).glob("*.pth")) 25 | assert world_size == len( 26 | checkpoints 27 | ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}" 28 | ckpt_path = checkpoints[local_rank] 29 | 30 | checkpoint = torch.load(ckpt_path, map_location="cpu") 31 | 32 | with open(Path(ckpt_dir) / "params.json", "r") as f: 33 | params = json.loads(f.read()) 34 | 35 | model_args: ModelArgs = ModelArgs( 36 | max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params 37 | ) 38 | tokenizer = Tokenizer(model_path=tokenizer_path) 39 | model_args.vocab_size = tokenizer.n_words 40 | torch.set_default_tensor_type(torch.cuda.HalfTensor) 41 | model = Transformer(model_args) 42 | torch.set_default_tensor_type(torch.FloatTensor) 43 | model.load_state_dict(checkpoint, strict=False) 44 | generator = LLaMA(model, tokenizer) 45 | return generator 46 | 47 | 48 | def process(prompt: str): 49 | print("Received:\n", prompt) 50 | prompts = [prompt] 51 | results = generator.generate( 52 | prompts, max_gen_len=256, temperature=temperature, top_p=top_p 53 | ) 54 | print("Generated:\n", results[0]) 55 | return str(results[0]) 56 | 57 | 58 | def get_args(): 59 | import argparse 60 | 61 | parser = argparse.ArgumentParser() 62 | parser.add_argument("--ckpt_dir", type=str, default="/llama_data/7B") 63 | parser.add_argument( 64 | "--tokenizer_path", type=str, default="/llama_data/tokenizer.model" 65 | ) 66 | return parser.parse_args() 67 | 68 | 69 | if __name__ == "__main__": 70 | args = get_args() 71 | ckpt_dir = args.ckpt_dir 72 | tokenizer_path = args.tokenizer_path 73 | temperature = 0.8 74 | top_p = 0.95 75 | max_seq_len = 512 76 | max_batch_size = 1 77 | 78 | local_rank, world_size = 0, 1 79 | generator = load( 80 | ckpt_dir, tokenizer_path, local_rank, world_size, 
max_seq_len, max_batch_size 81 | ) 82 | 83 | demo = gr.Interface( 84 | fn=process, 85 | inputs=gr.Textbox(lines=10, placeholder="Your prompt here..."), 86 | outputs="text", 87 | ) 88 | 89 | # To create a public link, set `share=True` in `launch()`. 90 | demo.launch(share=True) 91 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | $python inference.py 3 | Loading 4 | type(checkpoint): 5 | LLaMA Core model: 6 | Transformer( 7 | (tok_embeddings): Embedding(32000, 4096) 8 | (layers): ModuleList( 9 | (0): TransformerBlock( 10 | (attention): Attention( 11 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 12 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 13 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 14 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 15 | ) 16 | (feed_forward): FeedForward( 17 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 18 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 19 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 20 | ) 21 | (attention_norm): RMSNorm() 22 | (ffn_norm): RMSNorm() 23 | ) 24 | (1): TransformerBlock( 25 | (attention): Attention( 26 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 27 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 28 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 29 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 30 | ) 31 | (feed_forward): FeedForward( 32 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 33 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 34 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 35 | ) 36 | (attention_norm): RMSNorm() 37 | (ffn_norm): RMSNorm() 38 | ) 39 | (2): TransformerBlock( 40 | (attention): Attention( 41 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 42 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 43 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 44 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 45 | ) 46 | (feed_forward): FeedForward( 47 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 48 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 49 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 50 | ) 51 | (attention_norm): RMSNorm() 52 | (ffn_norm): RMSNorm() 53 | ) 54 | (3): TransformerBlock( 55 | (attention): Attention( 56 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 57 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 58 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 59 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 60 | ) 61 | (feed_forward): FeedForward( 62 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 63 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 64 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 65 | ) 66 | (attention_norm): RMSNorm() 67 | (ffn_norm): RMSNorm() 68 | ) 69 | (4): TransformerBlock( 70 | (attention): Attention( 71 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 72 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 73 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 74 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 75 | ) 76 | 
(feed_forward): FeedForward( 77 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 78 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 79 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 80 | ) 81 | (attention_norm): RMSNorm() 82 | (ffn_norm): RMSNorm() 83 | ) 84 | (5): TransformerBlock( 85 | (attention): Attention( 86 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 87 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 88 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 89 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 90 | ) 91 | (feed_forward): FeedForward( 92 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 93 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 94 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 95 | ) 96 | (attention_norm): RMSNorm() 97 | (ffn_norm): RMSNorm() 98 | ) 99 | (6): TransformerBlock( 100 | (attention): Attention( 101 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 102 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 103 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 104 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 105 | ) 106 | (feed_forward): FeedForward( 107 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 108 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 109 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 110 | ) 111 | (attention_norm): RMSNorm() 112 | (ffn_norm): RMSNorm() 113 | ) 114 | (7): TransformerBlock( 115 | (attention): Attention( 116 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 117 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 118 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 119 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 120 | ) 121 | (feed_forward): FeedForward( 122 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 123 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 124 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 125 | ) 126 | (attention_norm): RMSNorm() 127 | (ffn_norm): RMSNorm() 128 | ) 129 | (8): TransformerBlock( 130 | (attention): Attention( 131 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 132 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 133 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 134 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 135 | ) 136 | (feed_forward): FeedForward( 137 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 138 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 139 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 140 | ) 141 | (attention_norm): RMSNorm() 142 | (ffn_norm): RMSNorm() 143 | ) 144 | (9): TransformerBlock( 145 | (attention): Attention( 146 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 147 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 148 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 149 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 150 | ) 151 | (feed_forward): FeedForward( 152 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 153 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 154 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 155 | ) 156 | (attention_norm): RMSNorm() 157 
| (ffn_norm): RMSNorm() 158 | ) 159 | (10): TransformerBlock( 160 | (attention): Attention( 161 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 162 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 163 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 164 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 165 | ) 166 | (feed_forward): FeedForward( 167 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 168 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 169 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 170 | ) 171 | (attention_norm): RMSNorm() 172 | (ffn_norm): RMSNorm() 173 | ) 174 | (11): TransformerBlock( 175 | (attention): Attention( 176 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 177 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 178 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 179 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 180 | ) 181 | (feed_forward): FeedForward( 182 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 183 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 184 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 185 | ) 186 | (attention_norm): RMSNorm() 187 | (ffn_norm): RMSNorm() 188 | ) 189 | (12): TransformerBlock( 190 | (attention): Attention( 191 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 192 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 193 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 194 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 195 | ) 196 | (feed_forward): FeedForward( 197 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 198 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 199 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 200 | ) 201 | (attention_norm): RMSNorm() 202 | (ffn_norm): RMSNorm() 203 | ) 204 | (13): TransformerBlock( 205 | (attention): Attention( 206 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 207 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 208 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 209 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 210 | ) 211 | (feed_forward): FeedForward( 212 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 213 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 214 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 215 | ) 216 | (attention_norm): RMSNorm() 217 | (ffn_norm): RMSNorm() 218 | ) 219 | (14): TransformerBlock( 220 | (attention): Attention( 221 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 222 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 223 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 224 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 225 | ) 226 | (feed_forward): FeedForward( 227 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 228 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 229 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 230 | ) 231 | (attention_norm): RMSNorm() 232 | (ffn_norm): RMSNorm() 233 | ) 234 | (15): TransformerBlock( 235 | (attention): Attention( 236 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 237 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 238 | (wv): 
Linear(in_features=4096, out_features=4096, bias=False) 239 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 240 | ) 241 | (feed_forward): FeedForward( 242 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 243 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 244 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 245 | ) 246 | (attention_norm): RMSNorm() 247 | (ffn_norm): RMSNorm() 248 | ) 249 | (16): TransformerBlock( 250 | (attention): Attention( 251 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 252 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 253 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 254 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 255 | ) 256 | (feed_forward): FeedForward( 257 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 258 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 259 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 260 | ) 261 | (attention_norm): RMSNorm() 262 | (ffn_norm): RMSNorm() 263 | ) 264 | (17): TransformerBlock( 265 | (attention): Attention( 266 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 267 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 268 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 269 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 270 | ) 271 | (feed_forward): FeedForward( 272 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 273 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 274 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 275 | ) 276 | (attention_norm): RMSNorm() 277 | (ffn_norm): RMSNorm() 278 | ) 279 | (18): TransformerBlock( 280 | (attention): Attention( 281 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 282 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 283 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 284 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 285 | ) 286 | (feed_forward): FeedForward( 287 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 288 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 289 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 290 | ) 291 | (attention_norm): RMSNorm() 292 | (ffn_norm): RMSNorm() 293 | ) 294 | (19): TransformerBlock( 295 | (attention): Attention( 296 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 297 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 298 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 299 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 300 | ) 301 | (feed_forward): FeedForward( 302 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 303 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 304 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 305 | ) 306 | (attention_norm): RMSNorm() 307 | (ffn_norm): RMSNorm() 308 | ) 309 | (20): TransformerBlock( 310 | (attention): Attention( 311 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 312 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 313 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 314 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 315 | ) 316 | (feed_forward): FeedForward( 317 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 318 | (w2): 
Linear(in_features=11008, out_features=4096, bias=False) 319 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 320 | ) 321 | (attention_norm): RMSNorm() 322 | (ffn_norm): RMSNorm() 323 | ) 324 | (21): TransformerBlock( 325 | (attention): Attention( 326 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 327 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 328 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 329 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 330 | ) 331 | (feed_forward): FeedForward( 332 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 333 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 334 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 335 | ) 336 | (attention_norm): RMSNorm() 337 | (ffn_norm): RMSNorm() 338 | ) 339 | (22): TransformerBlock( 340 | (attention): Attention( 341 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 342 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 343 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 344 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 345 | ) 346 | (feed_forward): FeedForward( 347 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 348 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 349 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 350 | ) 351 | (attention_norm): RMSNorm() 352 | (ffn_norm): RMSNorm() 353 | ) 354 | (23): TransformerBlock( 355 | (attention): Attention( 356 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 357 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 358 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 359 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 360 | ) 361 | (feed_forward): FeedForward( 362 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 363 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 364 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 365 | ) 366 | (attention_norm): RMSNorm() 367 | (ffn_norm): RMSNorm() 368 | ) 369 | (24): TransformerBlock( 370 | (attention): Attention( 371 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 372 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 373 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 374 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 375 | ) 376 | (feed_forward): FeedForward( 377 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 378 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 379 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 380 | ) 381 | (attention_norm): RMSNorm() 382 | (ffn_norm): RMSNorm() 383 | ) 384 | (25): TransformerBlock( 385 | (attention): Attention( 386 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 387 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 388 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 389 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 390 | ) 391 | (feed_forward): FeedForward( 392 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 393 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 394 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 395 | ) 396 | (attention_norm): RMSNorm() 397 | (ffn_norm): RMSNorm() 398 | ) 399 | (26): TransformerBlock( 400 | (attention): 
Attention( 401 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 402 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 403 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 404 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 405 | ) 406 | (feed_forward): FeedForward( 407 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 408 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 409 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 410 | ) 411 | (attention_norm): RMSNorm() 412 | (ffn_norm): RMSNorm() 413 | ) 414 | (27): TransformerBlock( 415 | (attention): Attention( 416 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 417 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 418 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 419 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 420 | ) 421 | (feed_forward): FeedForward( 422 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 423 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 424 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 425 | ) 426 | (attention_norm): RMSNorm() 427 | (ffn_norm): RMSNorm() 428 | ) 429 | (28): TransformerBlock( 430 | (attention): Attention( 431 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 432 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 433 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 434 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 435 | ) 436 | (feed_forward): FeedForward( 437 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 438 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 439 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 440 | ) 441 | (attention_norm): RMSNorm() 442 | (ffn_norm): RMSNorm() 443 | ) 444 | (29): TransformerBlock( 445 | (attention): Attention( 446 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 447 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 448 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 449 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 450 | ) 451 | (feed_forward): FeedForward( 452 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 453 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 454 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 455 | ) 456 | (attention_norm): RMSNorm() 457 | (ffn_norm): RMSNorm() 458 | ) 459 | (30): TransformerBlock( 460 | (attention): Attention( 461 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 462 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 463 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 464 | (wo): Linear(in_features=4096, out_features=4096, bias=False) 465 | ) 466 | (feed_forward): FeedForward( 467 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 468 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 469 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 470 | ) 471 | (attention_norm): RMSNorm() 472 | (ffn_norm): RMSNorm() 473 | ) 474 | (31): TransformerBlock( 475 | (attention): Attention( 476 | (wq): Linear(in_features=4096, out_features=4096, bias=False) 477 | (wk): Linear(in_features=4096, out_features=4096, bias=False) 478 | (wv): Linear(in_features=4096, out_features=4096, bias=False) 479 | (wo): Linear(in_features=4096, 
out_features=4096, bias=False) 480 | ) 481 | (feed_forward): FeedForward( 482 | (w1): Linear(in_features=4096, out_features=11008, bias=False) 483 | (w2): Linear(in_features=11008, out_features=4096, bias=False) 484 | (w3): Linear(in_features=4096, out_features=11008, bias=False) 485 | ) 486 | (attention_norm): RMSNorm() 487 | (ffn_norm): RMSNorm() 488 | ) 489 | ) 490 | (norm): RMSNorm() 491 | (output): Linear(in_features=4096, out_features=32000, bias=False) 492 | ) 493 | Loaded in 19.94 seconds 494 | I believe the meaning of life is to appreciate everything you have. 495 | This is a journey for me to follow my heart and do what I love. I believe that life is to be lived in the moment and to give yourself the opportunity to dream and have the courage to pursue those dreams. 496 | Everything I do in my life is based on living in the moment. I think it is important to remember that we only have the moment with us. Life is fragile and there is no time to waste. We should all live in the moment and make the most of our lives. 497 | I am not a believer in good or bad. Everything that happens in our lives is the right thing for us to have at that time. We are always moving and growing and learning and life is a wonderful journey. 498 | My role as an artist is to try and depict my feelings and what I am going through. I hope that when people view my art they will also feel my emotions and connect to the pieces. 499 | I am an Australian born artist living in Perth, Australia. I work with acrylic paints, inks, gouache, oil paints and mixed media. I have been a full-time artist since 2008. I have exhibited 500 | ================================== 501 | ``` 502 | -------------------------------------------------------------------------------- /docs/download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/henrywoo/pyllama/9dca874d11ca2dbb6ebe4ffe48424f05f20a57eb/docs/download.png -------------------------------------------------------------------------------- /docs/llama_hf.md: -------------------------------------------------------------------------------- 1 | ``` 2 | LLaMAForCausalLM( 3 | (model): LLaMAModel( 4 | (embed_tokens): Embedding(32000, 4096, padding_idx=31999) 5 | (layers): ModuleList( 6 | (0): LLaMADecoderLayer( 7 | (self_attn): LLaMAAttention( 8 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 9 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 10 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 11 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 12 | (rotary_emb): RotaryEmbedding() 13 | ) 14 | (mlp): LLaMAMLP( 15 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 16 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 17 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 18 | (act_fn): SiLUActivation() 19 | ) 20 | (input_layernorm): RMSNorm() 21 | (post_attention_layernorm): RMSNorm() 22 | ) 23 | (1): LLaMADecoderLayer( 24 | (self_attn): LLaMAAttention( 25 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 26 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 27 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 28 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 29 | (rotary_emb): RotaryEmbedding() 30 | ) 31 | (mlp): LLaMAMLP( 32 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 33 | 
(down_proj): Linear(in_features=11008, out_features=4096, bias=False) 34 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 35 | (act_fn): SiLUActivation() 36 | ) 37 | (input_layernorm): RMSNorm() 38 | (post_attention_layernorm): RMSNorm() 39 | ) 40 | (2): LLaMADecoderLayer( 41 | (self_attn): LLaMAAttention( 42 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 43 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 44 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 45 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 46 | (rotary_emb): RotaryEmbedding() 47 | ) 48 | (mlp): LLaMAMLP( 49 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 50 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 51 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 52 | (act_fn): SiLUActivation() 53 | ) 54 | (input_layernorm): RMSNorm() 55 | (post_attention_layernorm): RMSNorm() 56 | ) 57 | (3): LLaMADecoderLayer( 58 | (self_attn): LLaMAAttention( 59 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 60 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 61 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 62 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 63 | (rotary_emb): RotaryEmbedding() 64 | ) 65 | (mlp): LLaMAMLP( 66 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 67 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 68 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 69 | (act_fn): SiLUActivation() 70 | ) 71 | (input_layernorm): RMSNorm() 72 | (post_attention_layernorm): RMSNorm() 73 | ) 74 | (4): LLaMADecoderLayer( 75 | (self_attn): LLaMAAttention( 76 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 77 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 78 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 79 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 80 | (rotary_emb): RotaryEmbedding() 81 | ) 82 | (mlp): LLaMAMLP( 83 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 84 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 85 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 86 | (act_fn): SiLUActivation() 87 | ) 88 | (input_layernorm): RMSNorm() 89 | (post_attention_layernorm): RMSNorm() 90 | ) 91 | (5): LLaMADecoderLayer( 92 | (self_attn): LLaMAAttention( 93 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 94 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 95 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 96 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 97 | (rotary_emb): RotaryEmbedding() 98 | ) 99 | (mlp): LLaMAMLP( 100 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 101 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 102 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 103 | (act_fn): SiLUActivation() 104 | ) 105 | (input_layernorm): RMSNorm() 106 | (post_attention_layernorm): RMSNorm() 107 | ) 108 | (6): LLaMADecoderLayer( 109 | (self_attn): LLaMAAttention( 110 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 111 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 112 | (v_proj): 
Linear(in_features=4096, out_features=4096, bias=False) 113 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 114 | (rotary_emb): RotaryEmbedding() 115 | ) 116 | (mlp): LLaMAMLP( 117 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 118 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 119 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 120 | (act_fn): SiLUActivation() 121 | ) 122 | (input_layernorm): RMSNorm() 123 | (post_attention_layernorm): RMSNorm() 124 | ) 125 | (7): LLaMADecoderLayer( 126 | (self_attn): LLaMAAttention( 127 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 128 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 129 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 130 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 131 | (rotary_emb): RotaryEmbedding() 132 | ) 133 | (mlp): LLaMAMLP( 134 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 135 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 136 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 137 | (act_fn): SiLUActivation() 138 | ) 139 | (input_layernorm): RMSNorm() 140 | (post_attention_layernorm): RMSNorm() 141 | ) 142 | (8): LLaMADecoderLayer( 143 | (self_attn): LLaMAAttention( 144 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 145 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 146 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 147 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 148 | (rotary_emb): RotaryEmbedding() 149 | ) 150 | (mlp): LLaMAMLP( 151 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 152 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 153 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 154 | (act_fn): SiLUActivation() 155 | ) 156 | (input_layernorm): RMSNorm() 157 | (post_attention_layernorm): RMSNorm() 158 | ) 159 | (9): LLaMADecoderLayer( 160 | (self_attn): LLaMAAttention( 161 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 162 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 163 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 164 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 165 | (rotary_emb): RotaryEmbedding() 166 | ) 167 | (mlp): LLaMAMLP( 168 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 169 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 170 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 171 | (act_fn): SiLUActivation() 172 | ) 173 | (input_layernorm): RMSNorm() 174 | (post_attention_layernorm): RMSNorm() 175 | ) 176 | (10): LLaMADecoderLayer( 177 | (self_attn): LLaMAAttention( 178 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 179 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 180 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 181 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 182 | (rotary_emb): RotaryEmbedding() 183 | ) 184 | (mlp): LLaMAMLP( 185 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 186 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 187 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 188 | (act_fn): 
SiLUActivation() 189 | ) 190 | (input_layernorm): RMSNorm() 191 | (post_attention_layernorm): RMSNorm() 192 | ) 193 | (11): LLaMADecoderLayer( 194 | (self_attn): LLaMAAttention( 195 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 196 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 197 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 198 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 199 | (rotary_emb): RotaryEmbedding() 200 | ) 201 | (mlp): LLaMAMLP( 202 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 203 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 204 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 205 | (act_fn): SiLUActivation() 206 | ) 207 | (input_layernorm): RMSNorm() 208 | (post_attention_layernorm): RMSNorm() 209 | ) 210 | (12): LLaMADecoderLayer( 211 | (self_attn): LLaMAAttention( 212 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 213 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 214 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 215 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 216 | (rotary_emb): RotaryEmbedding() 217 | ) 218 | (mlp): LLaMAMLP( 219 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 220 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 221 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 222 | (act_fn): SiLUActivation() 223 | ) 224 | (input_layernorm): RMSNorm() 225 | (post_attention_layernorm): RMSNorm() 226 | ) 227 | (13): LLaMADecoderLayer( 228 | (self_attn): LLaMAAttention( 229 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 230 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 231 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 232 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 233 | (rotary_emb): RotaryEmbedding() 234 | ) 235 | (mlp): LLaMAMLP( 236 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 237 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 238 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 239 | (act_fn): SiLUActivation() 240 | ) 241 | (input_layernorm): RMSNorm() 242 | (post_attention_layernorm): RMSNorm() 243 | ) 244 | (14): LLaMADecoderLayer( 245 | (self_attn): LLaMAAttention( 246 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 247 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 248 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 249 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 250 | (rotary_emb): RotaryEmbedding() 251 | ) 252 | (mlp): LLaMAMLP( 253 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 254 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 255 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 256 | (act_fn): SiLUActivation() 257 | ) 258 | (input_layernorm): RMSNorm() 259 | (post_attention_layernorm): RMSNorm() 260 | ) 261 | (15): LLaMADecoderLayer( 262 | (self_attn): LLaMAAttention( 263 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 264 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 265 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 266 | (o_proj): Linear(in_features=4096, 
out_features=4096, bias=False) 267 | (rotary_emb): RotaryEmbedding() 268 | ) 269 | (mlp): LLaMAMLP( 270 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 271 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 272 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 273 | (act_fn): SiLUActivation() 274 | ) 275 | (input_layernorm): RMSNorm() 276 | (post_attention_layernorm): RMSNorm() 277 | ) 278 | (16): LLaMADecoderLayer( 279 | (self_attn): LLaMAAttention( 280 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 281 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 282 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 283 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 284 | (rotary_emb): RotaryEmbedding() 285 | ) 286 | (mlp): LLaMAMLP( 287 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 288 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 289 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 290 | (act_fn): SiLUActivation() 291 | ) 292 | (input_layernorm): RMSNorm() 293 | (post_attention_layernorm): RMSNorm() 294 | ) 295 | (17): LLaMADecoderLayer( 296 | (self_attn): LLaMAAttention( 297 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 298 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 299 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 300 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 301 | (rotary_emb): RotaryEmbedding() 302 | ) 303 | (mlp): LLaMAMLP( 304 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 305 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 306 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 307 | (act_fn): SiLUActivation() 308 | ) 309 | (input_layernorm): RMSNorm() 310 | (post_attention_layernorm): RMSNorm() 311 | ) 312 | (18): LLaMADecoderLayer( 313 | (self_attn): LLaMAAttention( 314 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 315 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 316 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 317 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 318 | (rotary_emb): RotaryEmbedding() 319 | ) 320 | (mlp): LLaMAMLP( 321 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 322 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 323 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 324 | (act_fn): SiLUActivation() 325 | ) 326 | (input_layernorm): RMSNorm() 327 | (post_attention_layernorm): RMSNorm() 328 | ) 329 | (19): LLaMADecoderLayer( 330 | (self_attn): LLaMAAttention( 331 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 332 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 333 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 334 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 335 | (rotary_emb): RotaryEmbedding() 336 | ) 337 | (mlp): LLaMAMLP( 338 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 339 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 340 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 341 | (act_fn): SiLUActivation() 342 | ) 343 | (input_layernorm): RMSNorm() 344 | (post_attention_layernorm): RMSNorm() 
345 | ) 346 | (20): LLaMADecoderLayer( 347 | (self_attn): LLaMAAttention( 348 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 349 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 350 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 351 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 352 | (rotary_emb): RotaryEmbedding() 353 | ) 354 | (mlp): LLaMAMLP( 355 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 356 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 357 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 358 | (act_fn): SiLUActivation() 359 | ) 360 | (input_layernorm): RMSNorm() 361 | (post_attention_layernorm): RMSNorm() 362 | ) 363 | (21): LLaMADecoderLayer( 364 | (self_attn): LLaMAAttention( 365 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 366 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 367 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 368 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 369 | (rotary_emb): RotaryEmbedding() 370 | ) 371 | (mlp): LLaMAMLP( 372 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 373 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 374 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 375 | (act_fn): SiLUActivation() 376 | ) 377 | (input_layernorm): RMSNorm() 378 | (post_attention_layernorm): RMSNorm() 379 | ) 380 | (22): LLaMADecoderLayer( 381 | (self_attn): LLaMAAttention( 382 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 383 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 384 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 385 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 386 | (rotary_emb): RotaryEmbedding() 387 | ) 388 | (mlp): LLaMAMLP( 389 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 390 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 391 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 392 | (act_fn): SiLUActivation() 393 | ) 394 | (input_layernorm): RMSNorm() 395 | (post_attention_layernorm): RMSNorm() 396 | ) 397 | (23): LLaMADecoderLayer( 398 | (self_attn): LLaMAAttention( 399 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 400 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 401 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 402 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 403 | (rotary_emb): RotaryEmbedding() 404 | ) 405 | (mlp): LLaMAMLP( 406 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 407 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 408 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 409 | (act_fn): SiLUActivation() 410 | ) 411 | (input_layernorm): RMSNorm() 412 | (post_attention_layernorm): RMSNorm() 413 | ) 414 | (24): LLaMADecoderLayer( 415 | (self_attn): LLaMAAttention( 416 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 417 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 418 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 419 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 420 | (rotary_emb): RotaryEmbedding() 421 | ) 422 | (mlp): LLaMAMLP( 423 | 
(gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 424 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 425 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 426 | (act_fn): SiLUActivation() 427 | ) 428 | (input_layernorm): RMSNorm() 429 | (post_attention_layernorm): RMSNorm() 430 | ) 431 | (25): LLaMADecoderLayer( 432 | (self_attn): LLaMAAttention( 433 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 434 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 435 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 436 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 437 | (rotary_emb): RotaryEmbedding() 438 | ) 439 | (mlp): LLaMAMLP( 440 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 441 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 442 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 443 | (act_fn): SiLUActivation() 444 | ) 445 | (input_layernorm): RMSNorm() 446 | (post_attention_layernorm): RMSNorm() 447 | ) 448 | (26): LLaMADecoderLayer( 449 | (self_attn): LLaMAAttention( 450 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 451 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 452 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 453 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 454 | (rotary_emb): RotaryEmbedding() 455 | ) 456 | (mlp): LLaMAMLP( 457 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 458 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 459 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 460 | (act_fn): SiLUActivation() 461 | ) 462 | (input_layernorm): RMSNorm() 463 | (post_attention_layernorm): RMSNorm() 464 | ) 465 | (27): LLaMADecoderLayer( 466 | (self_attn): LLaMAAttention( 467 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 468 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 469 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 470 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 471 | (rotary_emb): RotaryEmbedding() 472 | ) 473 | (mlp): LLaMAMLP( 474 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 475 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 476 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 477 | (act_fn): SiLUActivation() 478 | ) 479 | (input_layernorm): RMSNorm() 480 | (post_attention_layernorm): RMSNorm() 481 | ) 482 | (28): LLaMADecoderLayer( 483 | (self_attn): LLaMAAttention( 484 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 485 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 486 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 487 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 488 | (rotary_emb): RotaryEmbedding() 489 | ) 490 | (mlp): LLaMAMLP( 491 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 492 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 493 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 494 | (act_fn): SiLUActivation() 495 | ) 496 | (input_layernorm): RMSNorm() 497 | (post_attention_layernorm): RMSNorm() 498 | ) 499 | (29): LLaMADecoderLayer( 500 | (self_attn): LLaMAAttention( 501 | (q_proj): 
Linear(in_features=4096, out_features=4096, bias=False) 502 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 503 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 504 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 505 | (rotary_emb): RotaryEmbedding() 506 | ) 507 | (mlp): LLaMAMLP( 508 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 509 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 510 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 511 | (act_fn): SiLUActivation() 512 | ) 513 | (input_layernorm): RMSNorm() 514 | (post_attention_layernorm): RMSNorm() 515 | ) 516 | (30): LLaMADecoderLayer( 517 | (self_attn): LLaMAAttention( 518 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 519 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 520 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 521 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 522 | (rotary_emb): RotaryEmbedding() 523 | ) 524 | (mlp): LLaMAMLP( 525 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 526 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 527 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 528 | (act_fn): SiLUActivation() 529 | ) 530 | (input_layernorm): RMSNorm() 531 | (post_attention_layernorm): RMSNorm() 532 | ) 533 | (31): LLaMADecoderLayer( 534 | (self_attn): LLaMAAttention( 535 | (q_proj): Linear(in_features=4096, out_features=4096, bias=False) 536 | (k_proj): Linear(in_features=4096, out_features=4096, bias=False) 537 | (v_proj): Linear(in_features=4096, out_features=4096, bias=False) 538 | (o_proj): Linear(in_features=4096, out_features=4096, bias=False) 539 | (rotary_emb): RotaryEmbedding() 540 | ) 541 | (mlp): LLaMAMLP( 542 | (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) 543 | (down_proj): Linear(in_features=11008, out_features=4096, bias=False) 544 | (up_proj): Linear(in_features=4096, out_features=11008, bias=False) 545 | (act_fn): SiLUActivation() 546 | ) 547 | (input_layernorm): RMSNorm() 548 | (post_attention_layernorm): RMSNorm() 549 | ) 550 | ) 551 | (norm): RMSNorm() 552 | ) 553 | (lm_head): Linear(in_features=4096, out_features=32000, bias=False) 554 | ) 555 | ``` 556 | -------------------------------------------------------------------------------- /docs/llama_inference.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/henrywoo/pyllama/9dca874d11ca2dbb6ebe4ffe48424f05f20a57eb/docs/llama_inference.png -------------------------------------------------------------------------------- /docs/llama_multigpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/henrywoo/pyllama/9dca874d11ca2dbb6ebe4ffe48424f05f20a57eb/docs/llama_multigpu.png -------------------------------------------------------------------------------- /docs/llama_profiling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/henrywoo/pyllama/9dca874d11ca2dbb6ebe4ffe48424f05f20a57eb/docs/llama_profiling.png -------------------------------------------------------------------------------- /docs/llama_webui.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/henrywoo/pyllama/9dca874d11ca2dbb6ebe4ffe48424f05f20a57eb/docs/llama_webui.png -------------------------------------------------------------------------------- /docs/pyllama_7B_3GB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/henrywoo/pyllama/9dca874d11ca2dbb6ebe4ffe48424f05f20a57eb/docs/pyllama_7B_3GB.png -------------------------------------------------------------------------------- /docs/pyllama_7B_6GB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/henrywoo/pyllama/9dca874d11ca2dbb6ebe4ffe48424f05f20a57eb/docs/pyllama_7B_6GB.png -------------------------------------------------------------------------------- /download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright (c) Meta Platforms, Inc. and affiliates. 3 | # This software may be used and distributed according to the terms of the GNU General Public License version 3. 4 | 5 | PRESIGNED_URL="" # edit this with the presigned url 6 | MODEL_SIZE="7B,13B,30B,65B" # edit this list with the model sizes you wish to download 7 | TARGET_FOLDER="" # where all files should end up 8 | 9 | declare -A N_SHARD_DICT 10 | 11 | N_SHARD_DICT["7B"]="0" 12 | N_SHARD_DICT["13B"]="1" 13 | N_SHARD_DICT["30B"]="3" 14 | N_SHARD_DICT["65B"]="7" 15 | 16 | echo "Downloading tokenizer" 17 | if cd ${TARGET_FOLDER} && [[ ! -f tokenizer.model ]] && [[ ! -f tokenizer_checklist.chk ]] && ! md5sum -c tokenizer_checklist.chk; then 18 | wget ${PRESIGNED_URL/'*'/"tokenizer.model"} -O ${TARGET_FOLDER}"/tokenizer.model" 19 | wget ${PRESIGNED_URL/'*'/"tokenizer_checklist.chk"} -O ${TARGET_FOLDER}"/tokenizer_checklist.chk" 20 | (cd ${TARGET_FOLDER} && md5sum -c tokenizer_checklist.chk) 21 | else 22 | echo "Skipping downloading tokenizer, already exists and checksum matches" 23 | fi 24 | 25 | for i in ${MODEL_SIZE//,/ } 26 | do 27 | 28 | echo "Downloading ${i}" 29 | mkdir -p ${TARGET_FOLDER}"/${i}" 30 | 31 | file_name="${TARGET_FOLDER}/${i}/checklist.chk" 32 | echo "Downloading ${file_name}" 33 | if ! [[ -f "${file_name}" ]]; then 34 | wget ${PRESIGNED_URL/'*'/"${i}/checklist.chk"} -O ${TARGET_FOLDER}"/${i}/checklist.chk" 35 | else 36 | echo "Skipping downloading ${file_name}, already exists" 37 | fi 38 | for s in $(seq -f "0%g" 0 ${N_SHARD_DICT[$i]}) 39 | do 40 | echo $s 41 | file_name="consolidated.${s}.pth" 42 | echo $file_name 43 | checklist_file="${TARGET_FOLDER}/${i}/checklist.chk" 44 | echo "${checklist_file##*/}" 45 | checksum=$(grep "${file_name##*/}" "${checklist_file}" | cut -d' ' -f1) 46 | # echo $(cd ${TARGET_FOLDER}"/${i}" && md5sum 'consolidated.00.pth' | cut -d' ' -f1) 47 | 48 | if cd "${TARGET_FOLDER}/${i}" && ! [[ -f "${file_name}" ]] || ! [[ $(md5sum "${file_name}" | cut -d' ' -f1) == "${checksum}" ]]; then 49 | wget ${PRESIGNED_URL/'*'/"${i}/consolidated.${s}.pth"} -O ${TARGET_FOLDER}"/${i}/consolidated.${s}.pth" 50 | else 51 | echo "Skipping downloading ${file_name}, already exists and checksum matches" 52 | fi 53 | done 54 | file_name="params.json" 55 | if cd ${TARGET_FOLDER}/${i} && !
[[ -f "${file_name}" ]]; then 56 | wget ${PRESIGNED_URL/'*'/"${i}/params.json"} -O ${TARGET_FOLDER}"/${i}/params.json" 57 | else 58 | echo "Skipping downloading ${file_name}, already exists" 59 | fi 60 | (cd ${TARGET_FOLDER}"/${i}" && md5sum -c checklist.chk) 61 | done 62 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the GNU General Public License version 3. 3 | 4 | from typing import Tuple 5 | import os 6 | import sys 7 | import torch 8 | import fire 9 | import time 10 | import json 11 | 12 | from pathlib import Path 13 | 14 | from fairscale.nn.model_parallel.initialize import initialize_model_parallel 15 | 16 | from llama import Tokenizer, LLaMA 17 | from llama.model_parallel import ModelArgs, Transformer 18 | 19 | 20 | def setup_model_parallel() -> Tuple[int, int]: 21 | local_rank = int(os.environ.get("LOCAL_RANK", -1)) 22 | world_size = int(os.environ.get("WORLD_SIZE", -1)) 23 | 24 | torch.distributed.init_process_group("nccl") 25 | initialize_model_parallel(world_size) 26 | torch.cuda.set_device(local_rank) 27 | 28 | # seed must be the same in all processes 29 | torch.manual_seed(1) 30 | return local_rank, world_size 31 | 32 | 33 | def load(ckpt_dir: str, tokenizer_path: str, local_rank: int, world_size: int) -> LLaMA: 34 | start_time = time.time() 35 | checkpoints = sorted(Path(ckpt_dir).glob("*.pth")) 36 | assert world_size == len( 37 | checkpoints 38 | ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}" 39 | ckpt_path = checkpoints[local_rank] 40 | print("Loading") 41 | checkpoint = torch.load(ckpt_path, map_location="cpu") 42 | with open(Path(ckpt_dir) / "params.json", "r") as f: 43 | params = json.loads(f.read()) 44 | 45 | model_args: ModelArgs = ModelArgs(max_seq_len=1024, max_batch_size=32, **params) 46 | tokenizer = Tokenizer(model_path=tokenizer_path) 47 | model_args.vocab_size = tokenizer.n_words 48 | torch.set_default_tensor_type(torch.cuda.HalfTensor) 49 | model = Transformer(model_args) 50 | torch.set_default_tensor_type(torch.FloatTensor) 51 | model.load_state_dict(checkpoint, strict=False) 52 | 53 | generator = LLaMA(model, tokenizer) 54 | print(f"Loaded in {time.time() - start_time:.2f} seconds") 55 | return generator 56 | 57 | 58 | def main( 59 | ckpt_dir: str, tokenizer_path: str, temperature: float = 0.8, top_p: float = 0.95 60 | ): 61 | local_rank, world_size = setup_model_parallel() 62 | if local_rank > 0: 63 | sys.stdout = open(os.devnull, "w") 64 | 65 | generator = load(ckpt_dir, tokenizer_path, local_rank, world_size) 66 | prompts = [ 67 | "The capital of Germany is the city of", 68 | "Here is my sonnet in the style of Shakespeare about an artificial intelligence:", 69 | ] 70 | results = generator.generate( 71 | prompts, max_gen_len=256, temperature=temperature, top_p=top_p 72 | ) 73 | 74 | for result in results: 75 | print(result) 76 | print("\n==================================\n") 77 | 78 | 79 | if __name__ == "__main__": 80 | fire.Fire(main) 81 | -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import json 4 | from pathlib import Path 5 | from llama import ModelArgs, Transformer, Tokenizer, LLaMA 6 | 7 | 8 | def load( 
9 | ckpt_dir: str, 10 | tokenizer_path: str, 11 | local_rank: int, 12 | world_size: int, 13 | max_seq_len: int, 14 | max_batch_size: int, 15 | ) -> LLaMA: 16 | checkpoints = sorted(Path(ckpt_dir).glob("*.pth")) 17 | assert world_size == len( 18 | checkpoints 19 | ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}" 20 | ckpt_path = checkpoints[local_rank] 21 | 22 | checkpoint = torch.load(ckpt_path, map_location="cpu") 23 | 24 | with open(Path(ckpt_dir) / "params.json", "r") as f: 25 | params = json.loads(f.read()) 26 | 27 | model_args: ModelArgs = ModelArgs( 28 | max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params 29 | ) 30 | tokenizer = Tokenizer(model_path=tokenizer_path) 31 | model_args.vocab_size = tokenizer.n_words 32 | torch.set_default_tensor_type(torch.cuda.HalfTensor) 33 | model = Transformer(model_args) 34 | torch.set_default_tensor_type(torch.FloatTensor) 35 | model.load_state_dict(checkpoint, strict=False) 36 | generator = LLaMA(model, tokenizer) 37 | return generator 38 | 39 | 40 | def run( 41 | ckpt_dir: str, 42 | tokenizer_path: str, 43 | temperature: float = 0.8, 44 | top_p: float = 0.95, 45 | max_seq_len: int = 1024, 46 | max_batch_size: int = 1, 47 | ): 48 | local_rank = 0 49 | world_size = 1 50 | generator = load( 51 | ckpt_dir, tokenizer_path, local_rank, world_size, max_seq_len, max_batch_size 52 | ) 53 | prompts = [ 54 | # For these prompts, the expected answer is the natural continuation of the prompt 55 | "I believe the meaning of life is", # removed: keep only one prompt 56 | ] 57 | while True: 58 | print("Prompt:", prompts) 59 | results = generator.generate( 60 | prompts, max_gen_len=256, temperature=temperature, top_p=top_p 61 | ) 62 | for result in results: 63 | print("🦙LLaMA:", result.strip()) 64 | 65 | user_input = input("please enter your prompts (Ctrl+C to exit): ") 66 | prompts = [user_input] 67 | 68 | 69 | def get_args(): 70 | import argparse 71 | 72 | parser = argparse.ArgumentParser() 73 | parser.add_argument("--ckpt_dir", type=str, default="/llama_data/7B") 74 | parser.add_argument( 75 | "--tokenizer_path", type=str, default="/llama_data/tokenizer.model" 76 | ) 77 | return parser.parse_args() 78 | 79 | 80 | if __name__ == "__main__": 81 | args = get_args() 82 | run( 83 | ckpt_dir=args.ckpt_dir, 84 | tokenizer_path=args.tokenizer_path, 85 | temperature=0.8, 86 | top_p=0.95, 87 | max_seq_len=1024, 88 | max_batch_size=1, 89 | ) 90 | -------------------------------------------------------------------------------- /inference_driver.py: -------------------------------------------------------------------------------- 1 | import hiq, time 2 | from hiq.memory import total_gpu_memory_mb, get_memory_mb 3 | 4 | 5 | def run_main(): 6 | driver = hiq.HiQLatency( 7 | hiq_table_or_path=[ 8 | ["inference", "", "load", "load_llama"], 9 | ["llama.generation", "LLaMA", "generate", "generate"], 10 | # ["llama.model_single", "Transformer", "forward", "forward"], 11 | ], 12 | metric_funcs=[time.time, total_gpu_memory_mb, get_memory_mb], 13 | # extra_metrics={hiq.ExtraMetrics.ARGS}, 14 | ) 15 | 16 | args = hiq.mod("inference").get_args() 17 | hiq.mod("inference").run(args.ckpt_dir, args.tokenizer_path) 18 | print("*" * 30, "GPU/CPU/Latency Profiling", "*" * 30) 19 | driver.show() 20 | 21 | 22 | if __name__ == "__main__": 23 | run_main() 24 | -------------------------------------------------------------------------------- /llama/__init__.py: -------------------------------------------------------------------------------- 1 | from 
.generation import LLaMA 2 | 3 | 4 | def pyllama_env(x, default=None) -> bool: 5 | import os, ast 6 | t = os.environ.get(x, default) 7 | if isinstance(t, str) and t: 8 | try: 9 | return bool(ast.literal_eval(t)) 10 | except: 11 | return True 12 | return bool(t) 13 | 14 | if pyllama_env("PYLLAMA_META_MP"): 15 | from .model_parallel import ModelArgs, Transformer 16 | else: 17 | from .model_single import ModelArgs, Transformer 18 | from .tokenizer import Tokenizer 19 | 20 | __version__ = "0.0.2" 21 | -------------------------------------------------------------------------------- /llama/convert_llama.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import argparse 15 | import json 16 | import os 17 | import shutil 18 | 19 | import torch 20 | import hiq 21 | 22 | """ 23 | Sample usage: 24 | 25 | ``` 26 | python -m llama.convert_llama --ckpt_dir $CKPT_DIR --tokenizer_path $TOKENIZER_PATH \ 27 | --model 7B --output_dir converted_meta --to fb --max_batch_size 4 28 | ``` 29 | 30 | Thereafter, models can be loaded via: 31 | 32 | ``` 33 | tokenizer = llama.hf.LLaMATokenizer.from_pretrained("/output/path/tokenizer/") 34 | model = llama.hf.LLaMAForCausalLM.from_pretrained("/output/path/llama-7b/") 35 | ``` 36 | """ 37 | 38 | INTERMEDIATE_SIZE_MAP = { 39 | "7B": 11008, 40 | "13B": 13824, 41 | "30B": 17920, 42 | "65B": 22016, 43 | } 44 | NUM_SHARDS = { 45 | "7B": 1, 46 | "13B": 2, 47 | "30B": 4, 48 | "65B": 8, 49 | } 50 | META_KEY_TO_DIM = {"w1": 0, "w2": -1, "w3": 0, "wo": -1, "wq": 0, "wk": 0, "wv": 0, "output": 0, "tok_embeddings": -1, 51 | "ffn_norm": None, "attention_norm": None, "norm": None, "rope": None} 52 | 53 | def write_json(text, path): 54 | with open(path, "w") as f: 55 | json.dump(text, f) 56 | 57 | 58 | def write_model(model_path, input_base_path, model_size): 59 | assert model_size in INTERMEDIATE_SIZE_MAP 60 | os.makedirs(model_path, exist_ok=True) 61 | 62 | params = hiq.read_file(os.path.join(input_base_path, "params.json"), as_json=True) 63 | num_shards = NUM_SHARDS[model_size] 64 | n_layers = params["n_layers"] 65 | n_heads = params["n_heads"] 66 | n_heads_per_shard = n_heads // num_shards 67 | dim = params["dim"] 68 | dims_per_head = dim // n_heads 69 | base = 10000.0 70 | inv_freq = 1.0 / ( 71 | base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head) 72 | ) 73 | 74 | # permute for sliced rotary 75 | def permute(w): 76 | return ( 77 | w.view(n_heads, dim // n_heads // 2, 2, dim) 78 | .transpose(1, 2) 79 | .reshape(dim, dim) 80 | ) 81 | 82 | # Load weights 83 | if model_size == "7B": 84 | # Not shared 85 | # (The sharded implementation would also work, but this is simpler.) 
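# The 7B release ships a single shard (consolidated.00.pth), so it is loaded here as one state dict; the larger models are split across NUM_SHARDS files and are loaded in the else branch below as a list of per-shard state dicts that get concatenated layer by layer.
# As the "permute for sliced rotary" note above indicates, permute() reorders the wq/wk rows so that the converted q_proj/k_proj weights match the rotary-embedding layout expected by the Hugging Face LLaMA attention implementation.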
86 | loaded = torch.load( 87 | os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu" 88 | ) 89 | else: 90 | # Sharded 91 | loaded = [ 92 | torch.load( 93 | os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), 94 | map_location="cpu", 95 | ) 96 | for i in range(num_shards) 97 | ] 98 | param_count = 0 99 | index_dict = {"weight_map": {}} 100 | for layer_i in range(n_layers): 101 | filename = "pytorch_model-{:05d}-of-{:05d}.bin".format( 102 | layer_i + 1, 103 | n_layers + 1, 104 | ) 105 | if model_size == "7B": 106 | # Unsharded 107 | state_dict = { 108 | f"model.layers.{layer_i}.self_attn.q_proj.weight": permute( 109 | loaded[f"layers.{layer_i}.attention.wq.weight"] 110 | ), 111 | f"model.layers.{layer_i}.self_attn.k_proj.weight": permute( 112 | loaded[f"layers.{layer_i}.attention.wk.weight"] 113 | ), 114 | f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[ 115 | f"layers.{layer_i}.attention.wv.weight" 116 | ], 117 | f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[ 118 | f"layers.{layer_i}.attention.wo.weight" 119 | ], 120 | f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[ 121 | f"layers.{layer_i}.feed_forward.w1.weight" 122 | ], 123 | f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[ 124 | f"layers.{layer_i}.feed_forward.w2.weight" 125 | ], 126 | f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[ 127 | f"layers.{layer_i}.feed_forward.w3.weight" 128 | ], 129 | f"model.layers.{layer_i}.input_layernorm.weight": loaded[ 130 | f"layers.{layer_i}.attention_norm.weight" 131 | ], 132 | f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[ 133 | f"layers.{layer_i}.ffn_norm.weight" 134 | ], 135 | } 136 | else: 137 | # Sharded 138 | state_dict = { 139 | f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][ 140 | f"layers.{layer_i}.attention_norm.weight" 141 | ], 142 | f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][ 143 | f"layers.{layer_i}.ffn_norm.weight" 144 | ], 145 | } 146 | state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute( 147 | torch.cat( 148 | [ 149 | loaded[i][f"layers.{layer_i}.attention.wq.weight"].view( 150 | n_heads_per_shard, dims_per_head, dim 151 | ) 152 | for i in range(num_shards) 153 | ], 154 | dim=0, 155 | ).reshape(dim, dim) 156 | ) 157 | state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute( 158 | torch.cat( 159 | [ 160 | loaded[i][f"layers.{layer_i}.attention.wk.weight"].view( 161 | n_heads_per_shard, dims_per_head, dim 162 | ) 163 | for i in range(num_shards) 164 | ], 165 | dim=0, 166 | ).reshape(dim, dim) 167 | ) 168 | state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( 169 | [ 170 | loaded[i][f"layers.{layer_i}.attention.wv.weight"].view( 171 | n_heads_per_shard, dims_per_head, dim 172 | ) 173 | for i in range(num_shards) 174 | ], 175 | dim=0, 176 | ).reshape(dim, dim) 177 | 178 | state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( 179 | [ 180 | loaded[i][f"layers.{layer_i}.attention.wo.weight"] 181 | for i in range(num_shards) 182 | ], 183 | dim=1, 184 | ) 185 | state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( 186 | [ 187 | loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] 188 | for i in range(num_shards) 189 | ], 190 | dim=0, 191 | ) 192 | state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( 193 | [ 194 | loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] 195 | for i in range(num_shards) 196 | ], 197 | dim=1, 198 | ) 199 | 
state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( 200 | [ 201 | loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] 202 | for i in range(num_shards) 203 | ], 204 | dim=0, 205 | ) 206 | 207 | state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq 208 | for k, v in state_dict.items(): 209 | index_dict["weight_map"][k] = filename 210 | param_count += v.numel() 211 | torch.save(state_dict, os.path.join(model_path, filename)) 212 | 213 | filename = "pytorch_model-{:05d}-of-{:05d}.bin".format( 214 | n_layers + 1, 215 | n_layers + 1, 216 | ) 217 | if model_size == "7B": 218 | # Unsharded 219 | state_dict = { 220 | "model.embed_tokens.weight": loaded["tok_embeddings.weight"], 221 | "model.norm.weight": loaded["norm.weight"], 222 | "lm_head.weight": loaded["output.weight"], 223 | } 224 | else: 225 | state_dict = { 226 | "model.norm.weight": loaded[0]["norm.weight"], 227 | "model.embed_tokens.weight": torch.cat( 228 | [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1 229 | ), 230 | "lm_head.weight": torch.cat( 231 | [loaded[i]["output.weight"] for i in range(num_shards)], dim=0 232 | ), 233 | } 234 | 235 | for k, v in state_dict.items(): 236 | index_dict["weight_map"][k] = filename 237 | param_count += v.numel() 238 | torch.save(state_dict, os.path.join(model_path, filename)) 239 | 240 | # Write configs 241 | index_dict["metadata"] = {"total_size": param_count * 2} 242 | write_json(index_dict, os.path.join(model_path, "pytorch_model.bin.index.json")) 243 | config_out = { 244 | "architectures": ["LLaMAForCausalLM"], 245 | "bos_token_id": 0, 246 | "eos_token_id": 1, 247 | "hidden_act": "silu", 248 | "hidden_size": params["dim"], 249 | "intermediate_size": INTERMEDIATE_SIZE_MAP[model_size], 250 | "initializer_range": 0.02, 251 | "max_sequence_length": 2048, 252 | "model_type": "llama", 253 | "num_attention_heads": params["n_heads"], 254 | "num_hidden_layers": params["n_layers"], 255 | "pad_token_id": -1, 256 | "rms_norm_eps": params["norm_eps"], 257 | "torch_dtype": "float16", 258 | "transformers_version": "4.27.0.dev0", 259 | "use_cache": True, 260 | "vocab_size": 32000, 261 | } 262 | write_json( 263 | config_out, 264 | os.path.join(model_path, "config.json"), 265 | ) 266 | generation_config = { 267 | "_from_model_config": True, 268 | "bos_token_id": 0, 269 | "eos_token_id": 1, 270 | "pad_token_id": 0, 271 | "transformers_version": "4.27.0.dev0", 272 | } 273 | write_json( 274 | generation_config, 275 | os.path.join(model_path, "generation_config.json"), 276 | ) 277 | 278 | 279 | def write_tokenizer(tokenizer_path, input_tokenizer_path): 280 | os.makedirs(tokenizer_path, exist_ok=True) 281 | write_json({}, os.path.join(tokenizer_path, "special_tokens_map.json")) 282 | write_json( 283 | { 284 | "bos_token": "", 285 | "eos_token": "", 286 | "model_max_length": int(1e30), 287 | "tokenizer_class": "LLaMATokenizer", 288 | "unk_token": "", 289 | }, 290 | os.path.join(tokenizer_path, "tokenizer_config.json"), 291 | ) 292 | shutil.copyfile( 293 | input_tokenizer_path, os.path.join(tokenizer_path, "tokenizer.model") 294 | ) 295 | 296 | 297 | def convert_llama_fb(args): 298 | from pathlib import Path 299 | from tqdm import tqdm 300 | from llama import ModelArgs, Tokenizer, Transformer 301 | output_dir = os.path.join(args.output_dir, args.model_size) 302 | os.makedirs(output_dir, exist_ok=True) 303 | 304 | if "tokenizer.model" not in os.listdir(output_dir): 305 | shutil.copy(args.tokenizer_path, args.output_dir) 306 | 307 | tokenizer_path = 
os.path.join(args.output_dir, "tokenizer.model") 308 | 309 | cks = sorted(Path(args.ckpt_dir).glob("*.pth")) 310 | params = hiq.read_file(Path(args.ckpt_dir) / "params.json",as_json=True) 311 | model_args = ModelArgs(max_seq_len=2048, max_batch_size=args.max_batch_size, **params) 312 | tokenizer = Tokenizer(model_path=tokenizer_path) 313 | model_args.vocab_size = tokenizer.n_words 314 | 315 | torch.set_default_tensor_type(torch.HalfTensor) 316 | print(f"⌛️ Loading model...Thank you for your patience...") 317 | model = Transformer(model_args) 318 | torch.set_default_tensor_type(torch.FloatTensor) 319 | dt = {} 320 | print(f"⌛️ Converting model...Thank you for your patience...") 321 | for i, ckpt in tqdm(enumerate(cks), total=len(cks)): 322 | ck = torch.load(ckpt, map_location="cpu") 323 | for nm, pm in model.named_parameters(): 324 | if nm not in dt: 325 | dt[nm] = torch.zeros_like(pm, device="cpu") 326 | short_name = nm.split(".")[-2] 327 | if META_KEY_TO_DIM[short_name] is None and i == 0: 328 | dt[nm] = ck[nm] 329 | elif META_KEY_TO_DIM[short_name] == 0: 330 | size = ck[nm].size(0) 331 | dt[nm][size * i: size * (i + 1), :] = ck[nm] 332 | elif META_KEY_TO_DIM[short_name] == -1: 333 | size = ck[nm].size(-1) 334 | dt[nm][:, size * i: size * (i + 1)] = ck[nm] 335 | hiq.write_file(os.path.join(output_dir, "params.json"), json.dumps(params, indent=4)) 336 | torch.save(dt, os.path.join(output_dir, "state_dict.pt")) 337 | 338 | 339 | def convert_llama_hf(args): 340 | write_model( 341 | model_path=os.path.join( 342 | args.output_dir, "llama-{}".format(args.model_size).lower() 343 | ), 344 | input_base_path=args.ckpt_dir, 345 | model_size=args.model_size, 346 | ) 347 | write_tokenizer( 348 | tokenizer_path=os.path.join(args.output_dir, "tokenizer"), 349 | input_tokenizer_path=args.tokenizer_path, 350 | ) 351 | 352 | def get_args(): 353 | parser = argparse.ArgumentParser() 354 | parser.add_argument("--ckpt_dir", type=str, default="/llama_data/7B") 355 | parser.add_argument( 356 | "--tokenizer_path", type=str, default="/llama_data/tokenizer.model" 357 | ) 358 | parser.add_argument( 359 | "--model_size", 360 | choices=NUM_SHARDS.keys(), 361 | ) 362 | parser.add_argument( 363 | "--output_dir", 364 | help="Location to write HF model and tokenizer", 365 | ) 366 | parser.add_argument( 367 | "--max_batch_size", type=int, default=2 368 | ) 369 | parser.add_argument("--to", choices={"fb", "hf"}) 370 | return parser.parse_args() 371 | 372 | 373 | 374 | if __name__ == "__main__": 375 | args = get_args() 376 | if args.to == "hf": 377 | convert_llama_hf(args) 378 | elif args.to == "fb": 379 | convert_llama_fb(args) 380 | else: 381 | print(f"wrong argument: {args.to}") 382 | -------------------------------------------------------------------------------- /llama/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | from threading import Thread 3 | 4 | 5 | here = os.path.dirname(os.path.realpath(__file__)) 6 | 7 | 8 | def download(args=None): 9 | import hiq 10 | 11 | cmd = f"bash {here}/download_community.sh" 12 | if args is not None: 13 | if args.model_size: 14 | cmd += f" {args.model_size}" 15 | if args.folder: 16 | cmd += f" {args.folder}" 17 | retcode = hiq.execute_cmd(cmd, verbose=False, shell=True, runtime_output=True, env=os.environ) 18 | if retcode != 0: 19 | # retry 20 | download(args) 21 | 22 | 23 | def download_watchdog(args): 24 | def watch(): 25 | import time 26 | 27 | # every 120s, check total file size under folder to see if it increases as 
the download speed suggests. if not, restart download 28 | folder = args.folder if args.folder else "pyllama_data" 29 | last_total_size = -1 30 | while True: 31 | total_size = 0 32 | for dirpath, _, filenames in os.walk(folder): 33 | for f in filenames: 34 | fp = os.path.join(dirpath, f) 35 | total_size += os.path.getsize(fp) 36 | size_changed_mb = (total_size - last_total_size) / 1024 / 1024 37 | if last_total_size != -1 and size_changed_mb < 120 * args.download_speed_mb: 38 | print( 39 | f"Download watchdog: total file size {total_size / 1024 / 1024:.2f}MB increased too slowly ({size_changed_mb:.2f}MB in the last 120s), restarting download" 40 | ) 41 | import hiq 42 | 43 | cmd = f"bash {here}/download_community_stop.sh" 44 | hiq.execute_cmd(cmd, verbose=False, shell=True, runtime_output=True) 45 | else: 46 | if last_total_size != -1: 47 | print( 48 | f"Download watchdog: total file size increased normally at speed {size_changed_mb / 120:.2f}MB/s" 49 | ) 50 | last_total_size = total_size 51 | time.sleep(120) 52 | 53 | watch_thread = Thread(target=watch, daemon=True) 54 | watch_thread.start() 55 | 56 | 57 | def get_args(): 58 | import argparse 59 | 60 | parser = argparse.ArgumentParser() 61 | 62 | parser.add_argument( 63 | "--model_size", 64 | type=str, 65 | default="7B,13B,30B,65B", 66 | help='The size of the models that you want to download. A comma-separated string of any of "7B", "13B", "30B", "65B". In total, 219 GB of disk space is needed to download them all. If you only want to download the 7B model, just put "7B" here.', 67 | ) 68 | parser.add_argument( 69 | "--folder", 70 | type=str, 71 | default="pyllama_data", 72 | help="The target folder for the downloaded files", 73 | ) 74 | parser.add_argument( 75 | "--download_speed_mb", 76 | type=int, 77 | default=1, 78 | help="The accepted download speed in MB/s. If the download speed is lower than this, the download will be restarted.", 79 | ) 80 | args = parser.parse_args() 81 | return args 82 | 83 | 84 | if __name__ == "__main__": 85 | args = get_args() 86 | download_watchdog(args) 87 | download(args) 88 | -------------------------------------------------------------------------------- /llama/download_community.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | PRESIGNED_URL="https://agi.gpt4.org/llama/LLaMA" 5 | ALL_MODELS=7B,13B,30B,65B 6 | 7 | YELLOW=$(tput setaf 3) 8 | RED=$(tput setaf 1) 9 | CLEAR=$(tput sgr0) 10 | 11 | function usage { 12 | cat <<EOF 13 | USAGE: ./download_community.sh [<models>] [<target_folder>] 14 | 15 | Download the given llama <models> to <target_folder>. By default, will 16 | download all available models into the current directory 17 | 18 | OPTIONS 19 | 20 | -v, --verbose: enable verbose mode 21 | -h, --help: print this help and exit 22 | 23 | EXAMPLES 24 | 25 | Download all models ($ALL_MODELS) into the current directory 26 | 27 | ./download_community.sh 28 | 29 | Download the 7B and 13B parameter models to /usr/share/llama 30 | 31 | ./download_community.sh 7B,13B /usr/share/llama 32 | 33 | EOF 34 | exit 1 35 | } 36 | 37 | # print its argument in red and quit 38 | function die { 39 | printf "%s%s%s\n" "$RED" "$1" "$CLEAR" 40 | exit 1 41 | } 42 | 43 | # print its argument in yellow 44 | function log { 45 | printf "\n%s%s%s\n" "$YELLOW" "$1" "$CLEAR" 46 | } 47 | 48 | # download a file with a progress bar, then display a success message. Takes 49 | # two arguments: the URL and the output file name 50 | function download { 51 | if !
wget --continue --progress=bar:force "$1" -O "$2"; then 52 | die "failed to download $1 -> $2" 53 | fi 54 | echo ✅ "$2" 55 | } 56 | 57 | # change into the model directory and use md5sum -c to verify the checksums of 58 | # the model files within. Uses a subshell to avoid changing the script's 59 | # direcotry 60 | function verify { 61 | (cd "$1" && md5sum -c "$2") 62 | } 63 | 64 | # return the number of shards for a given model. Bash 3 doesn't support 65 | # associative arrays, so use a case statement instead. 66 | function nshards { 67 | case $1 in 68 | 7B) 69 | echo 0 70 | ;; 71 | 13B) 72 | echo 1 73 | ;; 74 | 30B) 75 | echo 3 76 | ;; 77 | 65B) 78 | echo 7 79 | ;; 80 | *) 81 | die "invalid argument to nshards: $1" 82 | ;; 83 | esac 84 | 85 | } 86 | 87 | # check for wget - if it's not present print an error 88 | if ! command -v wget &> /dev/null 89 | then 90 | die "wget not found. You must have wget installed and on your path to run this script" 91 | fi 92 | 93 | # parse the optional flags and discard them 94 | while true; do 95 | case $1 in 96 | -v|--verbose) 97 | set -x 98 | shift 99 | ;; 100 | -h|--help|help) 101 | usage 102 | ;; 103 | *) 104 | break 105 | ;; 106 | esac 107 | done 108 | 109 | # MODELS_TO_DOWNLOAD is a comma-separated list of models the user wants to 110 | # download, which defaults to all models. Split it into an array called MODELS 111 | MODELS_TO_DOWNLOAD=${1:-$ALL_MODELS} 112 | IFS="," read -r -a MODELS <<< "$MODELS_TO_DOWNLOAD" 113 | 114 | # TARGET_FOLDER is the root directory to download the models to 115 | TARGET_FOLDER=${2:-.} 116 | 117 | log "❤️ Resume download is supported. You can ctrl-c and rerun the program to resume the downloading" 118 | 119 | # ensure the targeted directory exists 120 | mkdir -p "$TARGET_FOLDER" 121 | 122 | log "Downloading tokenizer..." 123 | download "$PRESIGNED_URL/tokenizer.model" "$TARGET_FOLDER/tokenizer.model" 124 | download "$PRESIGNED_URL/tokenizer_checklist.chk" "$TARGET_FOLDER/tokenizer_checklist.chk" 125 | verify "$TARGET_FOLDER" tokenizer_checklist.chk 126 | 127 | # for each model, download each of its shards and then verify the checksums 128 | for model in "${MODELS[@]}" 129 | do 130 | log "Downloading $model" 131 | mkdir -p "$TARGET_FOLDER/$model" 132 | 133 | # download each shard in the model 134 | for s in $(seq -f "0%g" 0 "$(nshards "$model")") 135 | do 136 | fout="$TARGET_FOLDER/$model/consolidated.$s.pth" 137 | log "downloading file to $fout ...please wait for a few minutes ..." 
138 | download "$PRESIGNED_URL/$model/consolidated.$s.pth" "$fout" 139 | done 140 | 141 | # download the params and checksums 142 | download "$PRESIGNED_URL/$model/params.json" "$TARGET_FOLDER/$model/params.json" 143 | download "$PRESIGNED_URL/$model/checklist.chk" "$TARGET_FOLDER/$model/checklist.chk" 144 | 145 | log "Checking checksums for the $model model" 146 | verify "$TARGET_FOLDER/$model" checklist.chk 147 | done 148 | -------------------------------------------------------------------------------- /llama/download_community_stop.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ps aux | grep 'wget --continue --progress=bar:force https://agi.gpt4.org/llama/LLaMA/' | grep -v grep | awk '{print $2}' | xargs kill 3 | ps aux | grep '.*llama/download_community.sh' | grep -v grep | awk '{print $2}' | xargs kill 4 | -------------------------------------------------------------------------------- /llama/generation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the GNU General Public License version 3. 3 | 4 | from typing import List 5 | 6 | import torch 7 | 8 | from llama.tokenizer import Tokenizer 9 | 10 | 11 | class LLaMA: 12 | def __init__(self, model, tokenizer: Tokenizer): 13 | self.model = model 14 | self.tokenizer = tokenizer 15 | 16 | def _should_stop(self, tokens, prompt_tokens, stop_ids, stop_words): 17 | """credits go to: https://github.com/galatolofederico/vanilla-llama""" 18 | if stop_ids is not None: 19 | do_stop = [False for _ in range(len(tokens))] 20 | for i, (t, p) in enumerate(zip(tokens, prompt_tokens)): 21 | g = t[len(p):].tolist() 22 | for stop_id in stop_ids: 23 | if stop_id in g: 24 | do_stop[i] = True 25 | 26 | if all(do_stop): 27 | return True 28 | 29 | if stop_words is not None: 30 | do_stop = [False for _ in range(len(tokens))] 31 | for i, (t, p) in enumerate(zip(tokens, prompt_tokens)): 32 | t = t.clone() 33 | g = t[len(p):] 34 | g[g == self.tokenizer.pad_id] = self.tokenizer.eos_id 35 | g = g.tolist() 36 | d = self.tokenizer.decode(g) 37 | for stop_word in stop_words: 38 | if stop_word in d: 39 | do_stop[i] = True 40 | 41 | if all(do_stop): 42 | return True 43 | 44 | return False 45 | 46 | def generate( 47 | self, 48 | prompts: List[str], 49 | max_gen_len: int, 50 | temperature: float = 0.8, 51 | top_p: float = 0.95, 52 | stop_ids: List[int] = None, 53 | stop_words: List[str] = None, 54 | ) -> List[str]: 55 | bsz = len(prompts) 56 | params = self.model.params 57 | assert bsz <= params.max_batch_size, (bsz, params.max_batch_size) 58 | 59 | prompt_tokens = [self.tokenizer.encode(x, bos=True, eos=False) for x in prompts] 60 | 61 | min_prompt_size = min([len(t) for t in prompt_tokens]) 62 | max_prompt_size = max([len(t) for t in prompt_tokens]) 63 | 64 | total_len = min(params.max_seq_len, max_gen_len + max_prompt_size) 65 | 66 | tokens = torch.full((bsz, total_len), self.tokenizer.pad_id).cuda().long() 67 | for k, t in enumerate(prompt_tokens): 68 | tokens[k, : len(t)] = torch.tensor(t).long() 69 | input_text_mask = tokens != self.tokenizer.pad_id 70 | start_pos = min_prompt_size 71 | prev_pos = 0 72 | for cur_pos in range(start_pos, total_len): 73 | i = tokens[:, prev_pos:cur_pos] 74 | logits = self.model(i, prev_pos) 75 | if temperature > 0: 76 | probs = torch.softmax(logits / temperature, dim=-1) 77 | next_token = sample_top_p(probs, 
top_p) 78 | else: 79 | next_token = torch.argmax(logits, dim=-1) 80 | next_token = next_token.reshape(-1) 81 | # only replace token if prompt has already been generated 82 | next_token = torch.where( 83 | input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token 84 | ) 85 | tokens[:, cur_pos] = next_token 86 | prev_pos = cur_pos 87 | 88 | if self._should_stop(tokens, prompt_tokens, stop_ids, stop_words): 89 | break 90 | 91 | tokens[tokens == self.tokenizer.pad_id] = self.tokenizer.eos_id 92 | decoded = [] 93 | for i, t in enumerate(tokens.tolist()): 94 | # cut to max gen len 95 | t = t[: len(prompt_tokens[i]) + max_gen_len] 96 | # cut to eos tok if any 97 | try: 98 | t = t[: t.index(self.tokenizer.eos_id)] 99 | except ValueError: 100 | pass 101 | decoded.append(self.tokenizer.decode(t)) 102 | #print(decoded) 103 | return [postprocessing(i, stop_words) for i in decoded] 104 | 105 | 106 | def postprocessing(output_text, stop_words=None, threshold=10): 107 | sentences = output_text.split(".") 108 | filtered_sentences = [] 109 | for sentence in sentences: 110 | sentence = sentence.strip() 111 | if len(sentence) > threshold and sentence[-1] == ".": 112 | filtered_sentences.append(sentence) 113 | r = '.'.join(sentences).strip() 114 | if stop_words: 115 | for w in stop_words: 116 | if r.endswith(w): 117 | r = r[0:-len(w)].strip() 118 | if r[-1] != '.': 119 | r += '...' 120 | return r 121 | 122 | 123 | def sample_top_p(probs, p): 124 | probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True) 125 | probs_sum = torch.cumsum(probs_sort, dim=-1) 126 | mask = probs_sum - probs_sort > p 127 | probs_sort[mask] = 0.0 128 | probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True)) 129 | next_token = torch.multinomial(probs_sort, num_samples=1) 130 | next_token = torch.gather(probs_idx, -1, next_token) 131 | return next_token 132 | -------------------------------------------------------------------------------- /llama/hf/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
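# The lazy-import machinery below mirrors the pattern used by transformers itself:
# _import_structure lists the public symbols of each submodule, the sentencepiece/torch
# availability checks decide which entries get registered, and sys.modules[__name__] is
# swapped for a _LazyModule so the heavy submodules are only imported on first attribute
# access, e.g. `from llama.hf import LLaMATokenizer, LLaMAForCausalLM`.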
14 | from typing import TYPE_CHECKING 15 | 16 | from transformers.utils import ( 17 | OptionalDependencyNotAvailable, 18 | _LazyModule, 19 | is_torch_available, 20 | is_sentencepiece_available, 21 | ) 22 | 23 | 24 | _import_structure = { 25 | "configuration_llama": ["LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP", "LLaMAConfig"], 26 | } 27 | 28 | try: 29 | if not is_sentencepiece_available(): 30 | raise OptionalDependencyNotAvailable() 31 | except OptionalDependencyNotAvailable: 32 | pass 33 | else: 34 | _import_structure["tokenization_llama"] = ["LLaMATokenizer"] 35 | 36 | try: 37 | if not is_torch_available(): 38 | raise OptionalDependencyNotAvailable() 39 | except OptionalDependencyNotAvailable: 40 | pass 41 | else: 42 | _import_structure["modeling_llama"] = [ 43 | "LLaMAForCausalLM", 44 | "LLaMAModel", 45 | "LLaMAPreTrainedModel", 46 | ] 47 | 48 | 49 | if TYPE_CHECKING: 50 | from .configuration_llama import LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP, LLaMAConfig 51 | 52 | try: 53 | if not is_sentencepiece_available(): 54 | raise OptionalDependencyNotAvailable() 55 | except OptionalDependencyNotAvailable: 56 | pass 57 | else: 58 | from .tokenization_llama import LLaMATokenizer 59 | 60 | try: 61 | if not is_torch_available(): 62 | raise OptionalDependencyNotAvailable() 63 | except OptionalDependencyNotAvailable: 64 | pass 65 | else: 66 | from .modeling_llama import ( 67 | LLaMAForCausalLM, 68 | LLaMAModel, 69 | LLaMAPreTrainedModel, 70 | ) 71 | 72 | 73 | else: 74 | import sys 75 | 76 | sys.modules[__name__] = _LazyModule( 77 | __name__, globals()["__file__"], _import_structure, module_spec=__spec__ 78 | ) 79 | -------------------------------------------------------------------------------- /llama/hf/configuration_llama.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 3 | # 4 | # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX 5 | # and OPT implementations in this library. It has been modified from its 6 | # original forms to accommodate minor architectural differences compared 7 | # to GPT-NeoX and OPT used by the Meta AI team that trained the model. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | """ LLaMA model configuration""" 21 | 22 | from transformers.configuration_utils import PretrainedConfig 23 | from transformers.utils import logging 24 | 25 | 26 | logger = logging.get_logger(__name__) 27 | 28 | LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} 29 | 30 | 31 | class LLaMAConfig(PretrainedConfig): 32 | r""" 33 | This is the configuration class to store the configuration of a [`~LLaMAModel`]. It is used to instantiate an LLaMA 34 | model according to the specified arguments, defining the model architecture. Instantiating a configuration with the 35 | defaults will yield a similar configuration to that of the LLaMA-7B. 
36 | 37 | Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the 38 | documentation from [`PretrainedConfig`] for more information. 39 | 40 | 41 | Args: 42 | vocab_size (`int`, *optional*, defaults to 32000): 43 | Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the 44 | `inputs_ids` passed when calling [`~LLaMAModel`] or [`~TFLLaMAModel`]. 45 | hidden_size (`int`, *optional*, defaults to 4096): 46 | Dimension of the hidden representations. 47 | intermediate_size (`int`, *optional*, defaults to 11008): 48 | Dimension of the MLP representations. 49 | num_hidden_layers (`int`, *optional*, defaults to 32): 50 | Number of hidden layers in the Transformer encoder. 51 | num_attention_heads (`int`, *optional*, defaults to 32): 52 | Number of attention heads for each attention layer in the Transformer encoder. 53 | hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): 54 | The non-linear activation function (function or string) in the decoder. 55 | initializer_range (`float`, *optional*, defaults to 0.02): 56 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 57 | rms_norm_eps (`float`, *optional*, defaults to 1e-12): 58 | The epsilon used by the rms normalization layers. 59 | use_cache (`bool`, *optional*, defaults to `True`): 60 | Whether or not the model should return the last key/values attentions (not used by all models). Only 61 | relevant if `config.is_decoder=True`. 62 | tie_word_embeddings(`bool`, *optional*, defaults to `False`): 63 | Whether to tie weight embeddings 64 | Example: 65 | 66 | ```python 67 | >>> from transformers import LLaMAModel, LLaMAConfig 68 | 69 | >>> # Initializing a LLaMA llama-7b style configuration 70 | >>> configuration = LLaMAConfig() 71 | 72 | >>> # Initializing a model from the llama-7b style configuration 73 | >>> model = LLaMAModel(configuration) 74 | 75 | >>> # Accessing the model configuration 76 | >>> configuration = model.config 77 | ```""" 78 | model_type = "llama" 79 | 80 | def __init__( 81 | self, 82 | vocab_size=32000, 83 | hidden_size=4096, 84 | intermediate_size=11008, 85 | num_hidden_layers=32, 86 | num_attention_heads=32, 87 | hidden_act="silu", 88 | initializer_range=0.02, 89 | rms_norm_eps=1e-6, 90 | use_cache=True, 91 | pad_token_id=-1, 92 | bos_token_id=0, 93 | eos_token_id=1, 94 | tie_word_embeddings=False, 95 | **kwargs, 96 | ): 97 | self.vocab_size = vocab_size 98 | self.hidden_size = hidden_size 99 | self.intermediate_size = intermediate_size 100 | self.num_hidden_layers = num_hidden_layers 101 | self.num_attention_heads = num_attention_heads 102 | self.hidden_act = hidden_act 103 | self.initializer_range = initializer_range 104 | self.rms_norm_eps = rms_norm_eps 105 | self.use_cache = use_cache 106 | super().__init__( 107 | pad_token_id=pad_token_id, 108 | bos_token_id=bos_token_id, 109 | eos_token_id=eos_token_id, 110 | tie_word_embeddings=tie_word_embeddings, 111 | **kwargs, 112 | ) 113 | -------------------------------------------------------------------------------- /llama/hf/tokenization_llama.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 2 | # 3 | # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX 4 | # and OPT implementations in this library. 
It has been modified from its 5 | # original forms to accommodate minor architectural differences compared 6 | # to GPT-NeoX and OPT used by the Meta AI team that trained the model. 7 | # 8 | # Licensed under the Apache License, Version 2.0 (the "License"); 9 | # you may not use this file except in compliance with the License. 10 | # You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, software 15 | # distributed under the License is distributed on an "AS IS" BASIS, 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | # See the License for the specific language governing permissions and 18 | # limitations under the License. 19 | 20 | """Tokenization classes for LLaMA.""" 21 | import os 22 | import re 23 | from shutil import copyfile 24 | from typing import Any, Dict, List, Optional, Tuple 25 | 26 | import sentencepiece as spm 27 | 28 | from transformers.tokenization_utils import PreTrainedTokenizer 29 | from transformers.utils import logging 30 | 31 | 32 | logger = logging.get_logger(__name__) 33 | 34 | VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} 35 | 36 | PRETRAINED_VOCAB_FILES_MAP = {} 37 | 38 | 39 | class LLaMATokenizer(PreTrainedTokenizer): 40 | """ 41 | Construct a LLaMA tokenizer. Based on byte-level Byte-Pair-Encoding. 42 | 43 | Args: 44 | vocab_file (`str`): 45 | Path to the vocabulary file. 46 | """ 47 | 48 | vocab_files_names = VOCAB_FILES_NAMES 49 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 50 | model_input_names = ["input_ids", "attention_mask"] 51 | 52 | def __init__( 53 | self, 54 | vocab_file, 55 | unk_token="", 56 | bos_token=" ⁇ ", 57 | eos_token="", 58 | sp_model_kwargs: Optional[Dict[str, Any]] = None, 59 | add_bos_token=True, 60 | add_eos_token=False, 61 | **kwargs, 62 | ): 63 | self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs 64 | super().__init__( 65 | bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs 66 | ) 67 | self.vocab_file = vocab_file 68 | self.add_bos_token = add_bos_token 69 | self.add_eos_token = add_eos_token 70 | self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) 71 | self.sp_model.Load(vocab_file) 72 | 73 | """ Initialisation""" 74 | 75 | @property 76 | def vocab_size(self): 77 | """Returns vocab size""" 78 | return self.sp_model.get_piece_size() 79 | 80 | @property 81 | def bos_token_id(self) -> Optional[int]: 82 | return self.sp_model.bos_id() 83 | 84 | @property 85 | def eos_token_id(self) -> Optional[int]: 86 | return self.sp_model.eos_id() 87 | 88 | def get_vocab(self): 89 | """Returns vocab as a dict""" 90 | vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} 91 | vocab.update(self.added_tokens_encoder) 92 | return vocab 93 | 94 | def _tokenize(self, text): 95 | """Returns a tokenized string.""" 96 | return self.sp_model.encode(text, out_type=str) 97 | 98 | def _convert_token_to_id(self, token): 99 | """Converts a token (str) in an id using the vocab.""" 100 | return self.sp_model.piece_to_id(token) 101 | 102 | def _convert_id_to_token(self, index): 103 | """Converts an index (integer) in a token (str) using the vocab.""" 104 | token = self.sp_model.IdToPiece(index) 105 | return token 106 | 107 | def convert_tokens_to_string(self, tokens): 108 | """Converts a sequence of tokens (string) in a single string.""" 109 | current_sub_tokens = [] 110 | out_string = "" 111 | prev_is_special = False 112 | 
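        # Buffer ordinary pieces in current_sub_tokens and decode each run with a single
        # sp_model.decode() call; special tokens are appended to the output as-is (see the
        # check inside the loop below).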
for token in tokens: 113 | # make sure that special tokens are not decoded using sentencepiece model 114 | if token in self.all_special_tokens: 115 | if not prev_is_special: 116 | out_string += " " 117 | out_string += self.sp_model.decode(current_sub_tokens) + token 118 | prev_is_special = True 119 | current_sub_tokens = [] 120 | else: 121 | current_sub_tokens.append(token) 122 | prev_is_special = False 123 | out_string += self.sp_model.decode(current_sub_tokens) 124 | return out_string.strip() 125 | 126 | def save_vocabulary( 127 | self, save_directory, filename_prefix: Optional[str] = None 128 | ) -> Tuple[str]: 129 | """ 130 | Save the vocabulary and special tokens file to a directory. 131 | 132 | Args: 133 | save_directory (`str`): 134 | The directory in which to save the vocabulary. 135 | 136 | Returns: 137 | `Tuple(str)`: Paths to the files saved. 138 | """ 139 | if not os.path.isdir(save_directory): 140 | logger.error(f"Vocabulary path ({save_directory}) should be a directory") 141 | return 142 | out_vocab_file = os.path.join( 143 | save_directory, 144 | (filename_prefix + "-" if filename_prefix else "") 145 | + VOCAB_FILES_NAMES["vocab_file"], 146 | ) 147 | 148 | if os.path.abspath(self.vocab_file) != os.path.abspath( 149 | out_vocab_file 150 | ) and os.path.isfile(self.vocab_file): 151 | copyfile(self.vocab_file, out_vocab_file) 152 | elif not os.path.isfile(self.vocab_file): 153 | with open(out_vocab_file, "wb") as fi: 154 | content_spiece_model = self.sp_model.serialized_model_proto() 155 | fi.write(content_spiece_model) 156 | 157 | return (out_vocab_file,) 158 | 159 | def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): 160 | if self.add_bos_token: 161 | bos_token_ids = [self.bos_token_id] 162 | else: 163 | bos_token_ids = [] 164 | 165 | output = bos_token_ids + token_ids_0 166 | 167 | if token_ids_1 is not None: 168 | output = output + token_ids_1 169 | 170 | if self.add_eos_token: 171 | output = output + [self.eos_token_id] 172 | 173 | return output 174 | 175 | def get_special_tokens_mask( 176 | self, 177 | token_ids_0: List[int], 178 | token_ids_1: Optional[List[int]] = None, 179 | already_has_special_tokens: bool = False, 180 | ) -> List[int]: 181 | """ 182 | Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding 183 | special tokens using the tokenizer `prepare_for_model` method. 184 | 185 | Args: 186 | token_ids_0 (`List[int]`): 187 | List of IDs. 188 | token_ids_1 (`List[int]`, *optional*): 189 | Optional second list of IDs for sequence pairs. 190 | already_has_special_tokens (`bool`, *optional*, defaults to `False`): 191 | Whether or not the token list is already formatted with special tokens for the model. 192 | 193 | Returns: 194 | `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 195 | """ 196 | if already_has_special_tokens: 197 | return super().get_special_tokens_mask( 198 | token_ids_0=token_ids_0, 199 | token_ids_1=token_ids_1, 200 | already_has_special_tokens=True, 201 | ) 202 | 203 | if token_ids_1 is None: 204 | return [1] + ([0] * len(token_ids_0)) + [1] 205 | return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] 206 | 207 | def create_token_type_ids_from_sequences( 208 | self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None 209 | ) -> List[int]: 210 | """ 211 | Create a mask from the two sequences passed to be used in a sequence-pair classification task. 
T5 does not make 212 | use of token type ids, therefore a list of zeros is returned. 213 | 214 | Args: 215 | token_ids_0 (`List[int]`): 216 | List of IDs. 217 | token_ids_1 (`List[int]`, *optional*): 218 | Optional second list of IDs for sequence pairs. 219 | 220 | Returns: 221 | `List[int]`: List of zeros. 222 | """ 223 | eos = [self.eos_token_id] 224 | 225 | if token_ids_1 is None: 226 | return len(token_ids_0 + eos) * [0] 227 | return len(token_ids_0 + eos + token_ids_1 + eos) * [0] 228 | -------------------------------------------------------------------------------- /llama/hf/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .modeling_llama import LLaMAForCausalLM 3 | 4 | 5 | def non_ops(*args, **kwargs): 6 | pass 7 | 8 | 9 | def avoid_tensor_modified(): 10 | torch.nn.init.kaiming_uniform_ = non_ops 11 | torch.nn.init.uniform_ = non_ops 12 | torch.nn.init.normal_ = non_ops 13 | 14 | 15 | def get_llama(model, seqlen=1024): 16 | avoid_tensor_modified() 17 | model = LLaMAForCausalLM.from_pretrained(model, torch_dtype="auto") 18 | model.seqlen = seqlen 19 | return model 20 | -------------------------------------------------------------------------------- /llama/llama_infer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from llama.hf import LLaMATokenizer 4 | from llama.hf.utils import get_llama 5 | from llama.llama_quant import load_quant 6 | 7 | 8 | def get_args(): 9 | import argparse 10 | 11 | parser = argparse.ArgumentParser() 12 | 13 | parser.add_argument("--model", type=str, default="decapoda-research/llama-7b-hf", help="llama model to load") 14 | parser.add_argument( 15 | "--wbits", 16 | type=int, 17 | default=16, 18 | choices=[2, 3, 4, 8, 16], 19 | help="#bits to use for quantization; use 16 for evaluating base model.", 20 | ) 21 | parser.add_argument("--load", type=str, default="", help="Load quantized model.") 22 | parser.add_argument("--text", type=str, help="input text") 23 | parser.add_argument( 24 | "--min_length", 25 | type=int, 26 | default=10, 27 | help="The minimum length of the sequence to be generated.", 28 | ) 29 | parser.add_argument( 30 | "--seqlen", 31 | type=int, 32 | default=1024, 33 | help="The maximum length of the input sequence that LLaMA can process.", 34 | ) 35 | parser.add_argument( 36 | "--max_length", 37 | type=int, 38 | default=50, 39 | help="The maximum length of the output sequence to be generated.", 40 | ) 41 | 42 | parser.add_argument( 43 | "--top_p", 44 | type=float, 45 | default=0.95, 46 | help="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.", 47 | ) 48 | 49 | parser.add_argument( 50 | "--temperature", 51 | type=float, 52 | default=0.8, 53 | help="The value used to module the next token probabilities.", 54 | ) 55 | parser.add_argument( 56 | "--cuda", type=str, default="cuda:0", help="GPU device string, eg cuda:0." 
57 | ) 58 | args = parser.parse_args() 59 | return args 60 | 61 | 62 | def run(args=None): 63 | args = args or get_args() 64 | if args.load: 65 | model = load_quant(args.model, args.load, args.wbits, args.seqlen) 66 | else: 67 | model = get_llama(args.model) 68 | model.eval() 69 | if args.cuda.startswith("cuda"): 70 | dev = torch.device(args.cuda) 71 | else: 72 | dev = torch.device("cpu") 73 | 74 | model.to(dev) 75 | tokenizer = LLaMATokenizer.from_pretrained(args.model) 76 | input_ids = tokenizer.encode(args.text, return_tensors="pt").to(dev) 77 | 78 | with torch.no_grad(): 79 | generated_ids = model.generate( 80 | input_ids, 81 | do_sample=True, 82 | min_length=args.min_length, 83 | max_length=args.max_length, 84 | top_p=args.top_p, 85 | temperature=args.temperature, 86 | ) 87 | print("*"*80) 88 | print("🦙:", tokenizer.decode([el.item() for el in generated_ids[0]])) 89 | 90 | 91 | if __name__ == "__main__": 92 | run() 93 | -------------------------------------------------------------------------------- /llama/llama_multigpu.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import torch 4 | from accelerate import init_empty_weights, load_checkpoint_and_dispatch 5 | from tqdm import tqdm 6 | from pathlib import Path 7 | import hiq 8 | import os 9 | from llama import ModelArgs, Tokenizer, Transformer, LLaMA 10 | 11 | NUM_SHARDS = { 12 | "7B": 1, 13 | "13B": 2, 14 | "30B": 4, 15 | "65B": 8, 16 | } 17 | 18 | class LLaMAInference: 19 | def __init__(self, state_dict_dir, model_size, device_map="auto", **kwargs): 20 | 21 | state_dict = os.path.join(state_dict_dir, model_size, "state_dict.pt") 22 | params_file = os.path.join(state_dict_dir, model_size, "params.json") 23 | tokenizer_path = os.path.join(state_dict_dir, "tokenizer.model") 24 | params = hiq.read_file(params_file, as_json=True) 25 | 26 | model_args = dict( 27 | max_seq_len=2048, 28 | max_batch_size=1, 29 | **params 30 | ) 31 | model_args.update(kwargs) 32 | model_args = ModelArgs(**model_args) 33 | 34 | self.tokenizer = Tokenizer(model_path=tokenizer_path) 35 | model_args.vocab_size = self.tokenizer.n_words 36 | 37 | with init_empty_weights(): 38 | torch.set_default_tensor_type(torch.HalfTensor) 39 | model = Transformer(model_args) 40 | torch.set_default_tensor_type(torch.FloatTensor) 41 | 42 | self.model = load_checkpoint_and_dispatch( 43 | model, 44 | state_dict, 45 | device_map=device_map, 46 | no_split_module_classes=["TransformerBlock"] 47 | ) 48 | 49 | self.generator = LLaMA(self.model, self.tokenizer) 50 | 51 | def generate(self, texts, temperature=0.8, top_p=0.95, max_length=256, stop_ids=None, stop_words=None): 52 | results = self.generator.generate( 53 | texts, 54 | max_gen_len=max_length, 55 | temperature=temperature, 56 | top_p=top_p, 57 | stop_ids=stop_ids, 58 | stop_words=stop_words 59 | ) 60 | return results 61 | 62 | def get_args(): 63 | import argparse 64 | 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument("--state_dict_dir", type=str, default="/llama_data/7B") 67 | parser.add_argument( 68 | "--model_size", 69 | choices=NUM_SHARDS.keys(), 70 | ) 71 | return parser.parse_args() 72 | 73 | if __name__ == "__main__": 74 | args = get_args() 75 | i = LLaMAInference(args.state_dict_dir, args.model_size) 76 | results = i.generate(["The meaning of life is"]) 77 | for result in results: 78 | print("🦙LLaMA:", result.strip()) 79 | 80 | 81 | results = i.generate(["Question: why apple drops from the tree when it is ripe?\nAnswer:"], 82 | 
stop_words=["Question"]) 83 | for result in results: 84 | print("🦙LLaMA:", result.strip()) 85 | 86 | 87 | -------------------------------------------------------------------------------- /llama/llama_quant.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from gptq import ( 7 | GPTQ, 8 | Quantizer, 9 | find_layers, 10 | make_quant, 11 | QuantLinear, 12 | get_loaders, 13 | quantize, 14 | ) 15 | 16 | from llama.hf import LLaMAForCausalLM, LLaMATokenizer, LLaMAConfig 17 | from llama.hf.utils import avoid_tensor_modified, get_llama 18 | 19 | 20 | @torch.no_grad() 21 | def llama_sequential(model, dataloader, args, dev): 22 | use_cache = model.config.use_cache 23 | model.config.use_cache = False 24 | layers = model.model.layers 25 | 26 | model.model.embed_tokens = model.model.embed_tokens.to(dev) 27 | model.model.norm = model.model.norm.to(dev) 28 | layers[0] = layers[0].to(dev) 29 | 30 | dtype = next(iter(model.parameters())).dtype 31 | inps = torch.zeros( 32 | (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev 33 | ) 34 | cache = {"i": 0, "attention_mask": None} 35 | 36 | class Catcher(nn.Module): 37 | def __init__(self, module): 38 | super().__init__() 39 | self.module = module 40 | 41 | def forward(self, inp, **kwargs): 42 | # print("kwargs:", kwargs.keys()) 43 | inps[cache["i"]] = inp 44 | cache["i"] += 1 45 | cache["attention_mask"] = kwargs["attention_mask"] 46 | raise ValueError 47 | 48 | layers[0] = Catcher(layers[0]) 49 | for batch in dataloader: 50 | try: 51 | i = batch[0].to(dev) 52 | model(i) 53 | except ValueError: 54 | pass 55 | layers[0] = layers[0].module 56 | 57 | layers[0] = layers[0].cpu() 58 | model.model.embed_tokens = model.model.embed_tokens.cpu() 59 | model.model.norm = model.model.norm.cpu() 60 | torch.cuda.empty_cache() 61 | 62 | outs = torch.zeros_like(inps) 63 | attention_mask = cache["attention_mask"] 64 | 65 | quantizers = {} 66 | for i in range(len(layers)): 67 | layer = layers[i].to(dev) 68 | subset = find_layers(layer) 69 | name_to_gptq = {} 70 | for name in subset: 71 | name_to_gptq[name] = GPTQ(subset[name]) 72 | name_to_gptq[name].quantizer = Quantizer() 73 | name_to_gptq[name].quantizer.configure( 74 | args.wbits, perchannel=True, sym=False, mse=False 75 | ) 76 | 77 | def add_batch(name): 78 | def tmp(_, inp, out): 79 | name_to_gptq[name].add_batch(inp[0].data, out.data) 80 | 81 | return tmp 82 | 83 | handles = [] 84 | for name in subset: 85 | handles.append(subset[name].register_forward_hook(add_batch(name))) 86 | for j in range(args.nsamples): 87 | outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] 88 | for h in handles: 89 | h.remove() 90 | print(f"\nQuantize layer: {i} ", end=',') 91 | for name in subset: 92 | print(name, end=",") 93 | name_to_gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize) 94 | quantizers["model.layers.%d.%s" % (i, name)] = name_to_gptq[name].quantizer 95 | name_to_gptq[name].free() 96 | for j in range(args.nsamples): 97 | outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] 98 | 99 | layers[i] = layer.cpu() 100 | del layer 101 | del name_to_gptq 102 | torch.cuda.empty_cache() 103 | 104 | inps, outs = outs, inps 105 | 106 | model.config.use_cache = use_cache 107 | return quantizers 108 | 109 | 110 | @torch.no_grad() 111 | def llama_eval(model, testenc, args, dev): 112 | print("Evaluating ...") 113 | 114 | testenc = testenc.input_ids 115 | 
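    # Perplexity evaluation: split the tokenized test set into nsamples disjoint windows of
    # model.seqlen tokens, run them through the (possibly quantized) decoder one layer at a
    # time to bound GPU memory, and report exp(mean per-token negative log-likelihood) at the end.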
nsamples = testenc.numel() // model.seqlen 116 | 117 | use_cache = model.config.use_cache 118 | model.config.use_cache = False 119 | layers = model.model.layers 120 | 121 | model.model.embed_tokens = model.model.embed_tokens.to(dev) 122 | layers[0] = layers[0].to(dev) 123 | 124 | dtype = next(iter(model.parameters())).dtype 125 | inps = torch.zeros( 126 | (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev 127 | ) 128 | cache = {"i": 0, "attention_mask": None} 129 | 130 | class Catcher(nn.Module): 131 | def __init__(self, module): 132 | super().__init__() 133 | self.module = module 134 | 135 | def forward(self, inp, **kwargs): 136 | inps[cache["i"]] = inp 137 | cache["i"] += 1 138 | cache["attention_mask"] = kwargs["attention_mask"] 139 | raise ValueError 140 | 141 | layers[0] = Catcher(layers[0]) 142 | for i in range(nsamples): 143 | batch = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)].to(dev) 144 | try: 145 | model(batch) 146 | except ValueError: 147 | pass 148 | layers[0] = layers[0].module 149 | 150 | layers[0] = layers[0].cpu() 151 | model.model.embed_tokens = model.model.embed_tokens.cpu() 152 | torch.cuda.empty_cache() 153 | 154 | outs = torch.zeros_like(inps) 155 | attention_mask = cache["attention_mask"] 156 | 157 | for i in range(len(layers)): 158 | print(i) 159 | layer = layers[i].to(dev) 160 | 161 | if args.nearest: 162 | subset = find_layers(layer) 163 | for name in subset: 164 | quantizer = Quantizer() 165 | quantizer.configure(args.wbits, perchannel=True, sym=False, mse=False) 166 | W = subset[name].weight.data 167 | quantizer.find_params(W, weight=True) 168 | subset[name].weight.data = quantize( 169 | W, quantizer.scale, quantizer.zero, quantizer.maxq 170 | ).to(next(iter(layer.parameters())).dtype) 171 | 172 | for j in range(nsamples): 173 | outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] 174 | layers[i] = layer.cpu() 175 | del layer 176 | torch.cuda.empty_cache() 177 | inps, outs = outs, inps 178 | 179 | if model.model.norm is not None: 180 | model.model.norm = model.model.norm.to(dev) 181 | model.lm_head = model.lm_head.to(dev) 182 | 183 | testenc = testenc.to(dev) 184 | nlls = [] 185 | for i in range(nsamples): 186 | hidden_states = inps[i].unsqueeze(0) 187 | if model.model.norm is not None: 188 | hidden_states = model.model.norm(hidden_states) 189 | lm_logits = model.lm_head(hidden_states) 190 | shift_logits = lm_logits[:, :-1, :].contiguous() 191 | shift_labels = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)][:, 1:] 192 | loss_fct = nn.CrossEntropyLoss() 193 | loss = loss_fct( 194 | shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1) 195 | ) 196 | neg_log_likelihood = loss.float() * model.seqlen 197 | nlls.append(neg_log_likelihood) 198 | ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) 199 | print(ppl.item()) 200 | 201 | model.config.use_cache = use_cache 202 | 203 | 204 | # TODO: perform packing on GPU 205 | def llama_pack(model, quantizers, wbits): 206 | layers = find_layers(model) 207 | layers = {n: layers[n] for n in quantizers} 208 | make_quant(model, quantizers, wbits) 209 | qlayers = find_layers(model, [QuantLinear]) 210 | for name in qlayers: 211 | print(name) 212 | quantizers[name] = quantizers[name].cpu() 213 | qlayers[name].pack(layers[name], quantizers[name].scale, quantizers[name].zero) 214 | return model 215 | 216 | 217 | def load_quant(model_name, checkpoint, wbits, seqlen=1024, for_infer=True): 218 | """ 219 | seqlen - seqlen refers to the 
maximum length of the input sequence that the model can process. The input sequence can be a sequence of words, tokens, or characters, depending on how the model is tokenized. The seqlen parameter is important because it determines the amount of memory that the model requires to process the input sequence. If the input sequence is too long, it may exceed the memory capacity of the model, leading to out-of-memory errors or slower inference times. In order to handle longer sequences, some models use techniques such as attention masking or truncation, which allow the model to process only a portion of the input sequence at a time. The seqlen parameter determines the maximum length of the input sequence that can be processed in a single step. If the input sequence is longer than the seqlen parameter, it may need to be split into multiple segments and processed separately. 220 | """ 221 | import transformers 222 | 223 | config = LLaMAConfig.from_pretrained(model_name) 224 | avoid_tensor_modified() 225 | 226 | transformers.modeling_utils._init_weights = False 227 | torch.set_default_dtype(torch.half) 228 | model = LLaMAForCausalLM(config) 229 | torch.set_default_dtype(torch.float) 230 | if for_infer: 231 | model = model.eval() 232 | layers = find_layers(model) 233 | for name in ["lm_head"]: 234 | if name in layers: 235 | del layers[name] 236 | make_quant(model, layers, wbits) 237 | 238 | print(f"⌛️ Loading model from {checkpoint}...") 239 | model.load_state_dict(torch.load(checkpoint)) 240 | model.seqlen = seqlen 241 | print(f"✅ Model from {checkpoint} is loaded successfully.") 242 | 243 | return model 244 | 245 | 246 | def llama_multigpu(model, gpus): 247 | """A model parallelism implementation for LLaMA""" 248 | import math 249 | import copy 250 | 251 | model.model.embed_tokens = model.model.embed_tokens.to(gpus[0]) 252 | if hasattr(model.model, "norm") and model.model.norm: 253 | model.model.norm = model.model.norm.to(gpus[-1]) 254 | 255 | model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1]) 256 | 257 | cache = {"mask": None} 258 | 259 | class MoveModule(nn.Module): 260 | def __init__(self, module): 261 | super().__init__() 262 | self.module = module 263 | self.dev = next(iter(self.module.parameters())).device 264 | 265 | def forward(self, *inp, **kwargs): 266 | inp = list(inp) 267 | if inp[0].device != self.dev: 268 | inp[0] = inp[0].to(self.dev) 269 | if cache["mask"] is None or cache["mask"].device != self.dev: 270 | cache["mask"] = kwargs["attention_mask"].to(self.dev) 271 | kwargs["attention_mask"] = cache["mask"] 272 | tmp = self.module(*inp, **kwargs) 273 | return tmp 274 | 275 | layers = model.model.layers 276 | pergpu = math.ceil(len(layers) / len(gpus)) 277 | for i in range(len(layers)): 278 | layers[i] = MoveModule(layers[i].to(gpus[i // pergpu])) 279 | 280 | model.gpus = gpus 281 | 282 | 283 | def run_benchmark(model, input_ids, check=False, dev=torch.device("cuda:0")): 284 | input_ids = input_ids.to(model.gpus[0] if hasattr(model, "gpus") else dev) 285 | torch.cuda.synchronize() 286 | 287 | cache = {"past": None} 288 | 289 | def clear_past(i): 290 | def tmp(layer, inp, out): 291 | if cache["past"]: 292 | cache["past"][i] = None 293 | 294 | return tmp 295 | 296 | for i, layer in enumerate(model.model.layers): 297 | layer.register_forward_hook(clear_past(i)) 298 | 299 | print("Benchmarking ...") 300 | 301 | if check: 302 | loss = nn.CrossEntropyLoss() 303 | tot = 0.0 304 | 305 | def sync(): 306 | if hasattr(model, "gpus"): 307 | for gpu in model.gpus: 308 | 
torch.cuda.synchronize(gpu) 309 | else: 310 | torch.cuda.synchronize() 311 | 312 | max_memory = 0 313 | with torch.no_grad(): 314 | attention_mask = torch.ones((1, input_ids.numel()), device=dev) 315 | times = [] 316 | for i in range(input_ids.numel()): 317 | tick = time.time() 318 | out = model( 319 | input_ids[:, i].reshape(-1), 320 | past_key_values=cache["past"], 321 | attention_mask=attention_mask[:, : (i + 1)].reshape((1, -1)), 322 | ) 323 | sync() 324 | times.append(time.time() - tick) 325 | print(i, times[-1]) 326 | max_memory = max(max_memory, torch.cuda.memory_allocated() / 1024 / 1024) 327 | if check and i != input_ids.numel() - 1: 328 | tot += loss( 329 | out.logits[0].to(dev), input_ids[:, (i + 1)].to(dev) 330 | ).float() 331 | cache["past"] = list(out.past_key_values) 332 | del out 333 | sync() 334 | import numpy as np 335 | 336 | print("Median:", np.median(times)) 337 | if check: 338 | print("PPL:", torch.exp(tot / (input_ids.numel() - 1)).item()) 339 | print("max memory(MiB):", max_memory) 340 | 341 | 342 | def get_args(): 343 | import argparse 344 | 345 | parser = argparse.ArgumentParser() 346 | parser.add_argument( 347 | "model", 348 | type=str, 349 | help="llama model to load", 350 | default="decapoda-research/llama-7b-hf", 351 | ) 352 | parser.add_argument( 353 | "dataset", 354 | type=str, 355 | choices=["wikitext2", "ptb", "c4"], 356 | help="Where to extract calibration data from.", 357 | ) 358 | parser.add_argument("--ckpt_dir", type=str, default="/llama_data/7B") 359 | parser.add_argument( 360 | "--tokenizer_path", type=str, default="/llama_data/tokenizer.model" 361 | ) 362 | parser.add_argument( 363 | "--seed", type=int, default=0, help="Seed for sampling the calibration data." 364 | ) 365 | parser.add_argument( 366 | "--nsamples", type=int, default=128, help="Number of calibration data samples." 367 | ) 368 | parser.add_argument( 369 | "--percdamp", 370 | type=float, 371 | default=0.01, 372 | help="Percent of the average Hessian diagonal to use for dampening.", 373 | ) 374 | parser.add_argument( 375 | "--nearest", action="store_true", help="Whether to run the RTN baseline." 
376 | ) 377 | parser.add_argument( 378 | "--wbits", 379 | type=int, 380 | default=16, 381 | choices=[2, 3, 4, 8, 16], 382 | help="#bits to use for quantization; use 16 for evaluating base model.", 383 | ) 384 | parser.add_argument( 385 | "--groupsize", 386 | type=int, 387 | default=-1, 388 | help="Groupsize to use for quantization; default uses full row.", 389 | ) 390 | parser.add_argument( 391 | "--save", 392 | type=str, 393 | default="", 394 | help="Save quantized checkpoint under this name, eg pyllama-7B4b.pt.", 395 | ) 396 | parser.add_argument("--load", type=str, default="", help="Load quantized model.") 397 | parser.add_argument( 398 | "--benchmark", 399 | type=int, 400 | default=0, 401 | help="Number of tokens to use for benchmarking.", 402 | ) 403 | parser.add_argument( 404 | "--check", 405 | action="store_true", 406 | help="Whether to compute perplexity during benchmarking for verification.", 407 | ) 408 | parser.add_argument( 409 | "--cuda", 410 | type=str, 411 | default="cuda:0", 412 | help="GPU device string, 'cuda:0' by default.", 413 | ) 414 | parser.add_argument( 415 | "--eval", 416 | action="store_false", 417 | help="Evaluate the model with dataset wikitext2, ptb and c4", 418 | ) 419 | 420 | args = parser.parse_args() 421 | return args 422 | 423 | 424 | def run(args=None): 425 | args = args or get_args() 426 | if args.load: 427 | model = load_quant(args.model, args.load, args.wbits) 428 | else: 429 | model = get_llama(args.model) 430 | model.eval() 431 | if args.cuda.startswith("cuda"): 432 | dev = torch.device(args.cuda) 433 | else: 434 | dev = torch.device("cpu") 435 | 436 | tokenizer = LLaMATokenizer.from_pretrained( 437 | args.model, add_eos_token=True 438 | ) 439 | dataloader, testloader = get_loaders( 440 | args.dataset, 441 | nsamples=args.nsamples, 442 | seed=args.seed, 443 | model=args.model, 444 | seqlen=model.seqlen, 445 | tokenizer=tokenizer 446 | ) 447 | 448 | if not args.load and args.wbits < 16 and not args.nearest: 449 | quantizers = llama_sequential(model, dataloader, args, dev) 450 | 451 | if args.benchmark: 452 | gpus = [torch.device("cuda:%d" % i) for i in range(torch.cuda.device_count())] 453 | if len(gpus) > 1: 454 | llama_multigpu(model, gpus) 455 | else: 456 | model = model.to(dev) 457 | if args.benchmark: 458 | input_ids = next(iter(dataloader))[0][:, : args.benchmark] 459 | run_benchmark(model, input_ids, check=args.check) 460 | if args.load: 461 | exit() 462 | 463 | if args.save: 464 | llama_pack(model, quantizers, args.wbits) 465 | torch.save(model.state_dict(), args.save) 466 | 467 | if args.eval: 468 | for dataset in ["wikitext2", "ptb", "c4"]: 469 | dataloader, testloader = get_loaders( 470 | dataset, seed=args.seed, model=args.model, seqlen=model.seqlen, tokenizer=tokenizer 471 | ) 472 | print(dataset) 473 | llama_eval(model, testloader, args, dev) 474 | 475 | 476 | if __name__ == "__main__": 477 | run() 478 | -------------------------------------------------------------------------------- /llama/model_parallel.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the GNU General Public License version 3. 
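# Model-parallel definition of the LLaMA transformer: the attention/FFN projections use
# fairscale's ColumnParallelLinear/RowParallelLinear and the token embedding is a
# ParallelEmbedding, so each rank only materializes n_heads // world_size local heads
# (see Attention below). model_single.py provides the equivalent single-GPU model built
# from plain nn.Linear / nn.Embedding.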
3 | 4 | from typing import Optional, Tuple 5 | from dataclasses import dataclass 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | import torch.nn.functional as F 11 | 12 | import fairscale.nn.model_parallel.initialize as fs_init 13 | from fairscale.nn.model_parallel.layers import ( 14 | ParallelEmbedding, 15 | RowParallelLinear, 16 | ColumnParallelLinear, 17 | ) 18 | 19 | 20 | @dataclass 21 | class ModelArgs: 22 | dim: int = 512 23 | n_layers: int = 8 24 | n_heads: int = 8 25 | vocab_size: int = -1 # defined later by tokenizer 26 | multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 27 | norm_eps: float = 1e-5 28 | 29 | max_batch_size: int = 32 30 | max_seq_len: int = 2048 31 | 32 | 33 | class RMSNorm(torch.nn.Module): 34 | def __init__(self, dim: int, eps: float = 1e-6): 35 | super().__init__() 36 | self.eps = eps 37 | self.weight = nn.Parameter(torch.ones(dim)) 38 | 39 | def _norm(self, x): 40 | return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) 41 | 42 | def forward(self, x): 43 | output = self._norm(x.float()).type_as(x) 44 | return output * self.weight 45 | 46 | 47 | def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0): 48 | freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) 49 | t = torch.arange(end, device=freqs.device) # type: ignore 50 | freqs = torch.outer(t, freqs).float() # type: ignore 51 | freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64 52 | return freqs_cis 53 | 54 | 55 | def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): 56 | ndim = x.ndim 57 | assert 0 <= 1 < ndim 58 | assert freqs_cis.shape == (x.shape[1], x.shape[-1]) 59 | shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] 60 | return freqs_cis.view(*shape) 61 | 62 | 63 | def apply_rotary_emb( 64 | xq: torch.Tensor, 65 | xk: torch.Tensor, 66 | freqs_cis: torch.Tensor, 67 | ) -> Tuple[torch.Tensor, torch.Tensor]: 68 | xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) 69 | xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) 70 | freqs_cis = reshape_for_broadcast(freqs_cis, xq_) 71 | xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3) 72 | xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3) 73 | return xq_out.type_as(xq), xk_out.type_as(xk) 74 | 75 | 76 | class Attention(nn.Module): 77 | def __init__(self, args: ModelArgs): 78 | super().__init__() 79 | 80 | self.n_local_heads = args.n_heads // fs_init.get_model_parallel_world_size() 81 | self.head_dim = args.dim // args.n_heads 82 | 83 | self.wq = ColumnParallelLinear( 84 | args.dim, 85 | args.n_heads * self.head_dim, 86 | bias=False, 87 | gather_output=False, 88 | init_method=lambda x: x, 89 | ) 90 | self.wk = ColumnParallelLinear( 91 | args.dim, 92 | args.n_heads * self.head_dim, 93 | bias=False, 94 | gather_output=False, 95 | init_method=lambda x: x, 96 | ) 97 | self.wv = ColumnParallelLinear( 98 | args.dim, 99 | args.n_heads * self.head_dim, 100 | bias=False, 101 | gather_output=False, 102 | init_method=lambda x: x, 103 | ) 104 | self.wo = RowParallelLinear( 105 | args.n_heads * self.head_dim, 106 | args.dim, 107 | bias=False, 108 | input_is_parallel=True, 109 | init_method=lambda x: x, 110 | ) 111 | 112 | self.cache_k = torch.zeros( 113 | (args.max_batch_size, args.max_seq_len, self.n_local_heads, self.head_dim) 114 | ).cuda() 115 | self.cache_v = torch.zeros( 116 | (args.max_batch_size, args.max_seq_len, self.n_local_heads, self.head_dim) 117 | 
).cuda() 118 | 119 | def forward( 120 | self, 121 | x: torch.Tensor, 122 | start_pos: int, 123 | freqs_cis: torch.Tensor, 124 | mask: Optional[torch.Tensor], 125 | ): 126 | bsz, seqlen, _ = x.shape 127 | xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) 128 | 129 | xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim) 130 | xk = xk.view(bsz, seqlen, self.n_local_heads, self.head_dim) 131 | xv = xv.view(bsz, seqlen, self.n_local_heads, self.head_dim) 132 | 133 | xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis) 134 | 135 | self.cache_k = self.cache_k.to(xq) 136 | self.cache_v = self.cache_v.to(xq) 137 | 138 | self.cache_k[:bsz, start_pos : start_pos + seqlen] = xk 139 | self.cache_v[:bsz, start_pos : start_pos + seqlen] = xv 140 | 141 | keys = self.cache_k[:bsz, : start_pos + seqlen] 142 | values = self.cache_v[:bsz, : start_pos + seqlen] 143 | 144 | xq = xq.transpose(1, 2) 145 | keys = keys.transpose(1, 2) 146 | values = values.transpose(1, 2) 147 | scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim) 148 | if mask is not None: 149 | scores = scores + mask # (bs, n_local_heads, slen, cache_len + slen) 150 | scores = F.softmax(scores.float(), dim=-1).type_as(xq) 151 | output = torch.matmul(scores, values) # (bs, n_local_heads, slen, head_dim) 152 | output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1) 153 | 154 | return self.wo(output) 155 | 156 | 157 | class FeedForward(nn.Module): 158 | def __init__( 159 | self, 160 | dim: int, 161 | hidden_dim: int, 162 | multiple_of: int, 163 | ): 164 | super().__init__() 165 | hidden_dim = int(2 * hidden_dim / 3) 166 | hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) 167 | 168 | self.w1 = ColumnParallelLinear( 169 | dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x 170 | ) 171 | self.w2 = RowParallelLinear( 172 | hidden_dim, dim, bias=False, input_is_parallel=True, init_method=lambda x: x 173 | ) 174 | self.w3 = ColumnParallelLinear( 175 | dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x 176 | ) 177 | 178 | def forward(self, x): 179 | return self.w2(F.silu(self.w1(x)) * self.w3(x)) 180 | 181 | 182 | class TransformerBlock(nn.Module): 183 | def __init__(self, layer_id: int, args: ModelArgs): 184 | super().__init__() 185 | self.n_heads = args.n_heads 186 | self.dim = args.dim 187 | self.head_dim = args.dim // args.n_heads 188 | self.attention = Attention(args) 189 | self.feed_forward = FeedForward( 190 | dim=args.dim, hidden_dim=4 * args.dim, multiple_of=args.multiple_of 191 | ) 192 | self.layer_id = layer_id 193 | self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps) 194 | self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps) 195 | 196 | def forward( 197 | self, 198 | x: torch.Tensor, 199 | start_pos: int, 200 | freqs_cis: torch.Tensor, 201 | mask: Optional[torch.Tensor], 202 | ): 203 | h = x + self.attention.forward( 204 | self.attention_norm(x), start_pos, freqs_cis, mask 205 | ) 206 | out = h + self.feed_forward.forward(self.ffn_norm(h)) 207 | return out 208 | 209 | 210 | class Transformer(nn.Module): 211 | def __init__(self, params: ModelArgs): 212 | super().__init__() 213 | self.params = params 214 | self.vocab_size = params.vocab_size 215 | self.n_layers = params.n_layers 216 | 217 | self.tok_embeddings = ParallelEmbedding( 218 | params.vocab_size, params.dim, init_method=lambda x: x 219 | ) 220 | 221 | self.layers = torch.nn.ModuleList() 222 | for layer_id in range(params.n_layers): 223 | 
self.layers.append(TransformerBlock(layer_id, params)) 224 | 225 | self.norm = RMSNorm(params.dim, eps=params.norm_eps) 226 | self.output = ColumnParallelLinear( 227 | params.dim, params.vocab_size, bias=False, init_method=lambda x: x 228 | ) 229 | 230 | self.freqs_cis = precompute_freqs_cis( 231 | self.params.dim // self.params.n_heads, self.params.max_seq_len * 2 232 | ) 233 | 234 | @torch.inference_mode() 235 | def forward(self, tokens: torch.Tensor, start_pos: int): 236 | _bsz, seqlen = tokens.shape 237 | h = self.tok_embeddings(tokens) 238 | self.freqs_cis = self.freqs_cis.to(h.device) 239 | freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen] 240 | 241 | mask = None 242 | if seqlen > 1: 243 | mask = torch.full( 244 | (1, 1, seqlen, seqlen), float("-inf"), device=tokens.device 245 | ) 246 | mask = torch.triu(mask, diagonal=start_pos + 1).type_as(h) 247 | 248 | for layer in self.layers: 249 | h = layer(h, start_pos, freqs_cis, mask) 250 | h = self.norm(h) 251 | output = self.output(h[:, -1, :]) # only compute last logits 252 | return output.float() 253 | -------------------------------------------------------------------------------- /llama/model_single.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | from dataclasses import dataclass 3 | import math 4 | import torch 5 | from torch import nn 6 | import torch.nn.functional as F 7 | import hiq 8 | 9 | 10 | @dataclass 11 | class ModelArgs: 12 | dim: int = 512 13 | n_layers: int = 8 14 | n_heads: int = 8 15 | vocab_size: int = -1 # defined later by tokenizer 16 | multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 17 | norm_eps: float = 1e-5 18 | 19 | max_batch_size: int = 1 20 | max_seq_len: int = 2048 21 | 22 | 23 | class RMSNorm(torch.nn.Module): 24 | def __init__(self, dim: int, eps: float = 1e-6): 25 | super().__init__() 26 | self.eps = eps 27 | self.weight = nn.Parameter(torch.ones(dim)) 28 | 29 | def _norm(self, x): 30 | return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) 31 | 32 | def forward(self, x): 33 | output = self._norm(x.float()).type_as(x) 34 | return output * self.weight 35 | 36 | 37 | def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0): 38 | freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) 39 | t = torch.arange(end, device=freqs.device) # type: ignore 40 | freqs = torch.outer(t, freqs).float() # type: ignore 41 | freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64 42 | return freqs_cis 43 | 44 | 45 | def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): 46 | ndim = x.ndim 47 | assert 0 <= 1 < ndim 48 | assert freqs_cis.shape == (x.shape[1], x.shape[-1]) 49 | shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] 50 | return freqs_cis.view(*shape) 51 | 52 | 53 | def apply_rotary_emb( 54 | xq: torch.Tensor, 55 | xk: torch.Tensor, 56 | freqs_cis: torch.Tensor, 57 | ) -> Tuple[torch.Tensor, torch.Tensor]: 58 | xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) 59 | xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) 60 | freqs_cis = reshape_for_broadcast(freqs_cis, xq_) 61 | xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3) 62 | xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3) 63 | return xq_out.type_as(xq), xk_out.type_as(xk) 64 | 65 | 66 | class Attention(nn.Module): 67 | def __init__(self, args: ModelArgs): 68 | super().__init__() 69 | 70 | 
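        # Single-process variant: there is no tensor parallelism here, so every head is
        # "local" (world size of 1) and the projections are plain nn.Linear layers.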
self.n_local_heads = args.n_heads // 1 71 | self.head_dim = args.dim // args.n_heads 72 | 73 | self.wq = nn.Linear( 74 | args.dim, 75 | args.n_heads * self.head_dim, 76 | bias=False, 77 | ) 78 | self.wk = nn.Linear( 79 | args.dim, 80 | args.n_heads * self.head_dim, 81 | bias=False, 82 | ) 83 | self.wv = nn.Linear( 84 | args.dim, 85 | args.n_heads * self.head_dim, 86 | bias=False, 87 | ) 88 | self.wo = nn.Linear( 89 | args.n_heads * self.head_dim, 90 | args.dim, 91 | bias=False, 92 | ) 93 | self.cache_k = torch.zeros( 94 | (args.max_batch_size, args.max_seq_len, self.n_local_heads, self.head_dim) 95 | ) 96 | self.cache_v = torch.zeros( 97 | (args.max_batch_size, args.max_seq_len, self.n_local_heads, self.head_dim) 98 | ) 99 | if hiq.get_env_bool("KV_CAHCHE_IN_GPU", True): 100 | self.cache_k = self.cache_k.cuda() 101 | self.cache_v = self.cache_v.cuda() 102 | 103 | def forward( 104 | self, 105 | x: torch.Tensor, 106 | start_pos: int, 107 | freqs_cis: torch.Tensor, 108 | mask: Optional[torch.Tensor], 109 | ): 110 | bsz, seqlen, _ = x.shape 111 | xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) 112 | 113 | xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim) 114 | xk = xk.view(bsz, seqlen, self.n_local_heads, self.head_dim) 115 | xv = xv.view(bsz, seqlen, self.n_local_heads, self.head_dim) 116 | 117 | xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis) 118 | 119 | self.cache_k = self.cache_k.to(xq) 120 | self.cache_v = self.cache_v.to(xq) 121 | 122 | self.cache_k[:bsz, start_pos : start_pos + seqlen] = xk 123 | self.cache_v[:bsz, start_pos : start_pos + seqlen] = xv 124 | 125 | keys = self.cache_k[:bsz, : start_pos + seqlen] 126 | values = self.cache_v[:bsz, : start_pos + seqlen] 127 | 128 | xq = xq.transpose(1, 2) 129 | keys = keys.transpose(1, 2) 130 | values = values.transpose(1, 2) 131 | scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim) 132 | if mask is not None: 133 | scores = scores + mask # (bs, n_local_heads, slen, cache_len + slen) 134 | scores = F.softmax(scores.float(), dim=-1).type_as(xq) 135 | output = torch.matmul(scores, values) # (bs, n_local_heads, slen, head_dim) 136 | output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1) 137 | 138 | return self.wo(output) 139 | 140 | 141 | class FeedForward(nn.Module): 142 | def __init__( 143 | self, 144 | dim: int, 145 | hidden_dim: int, 146 | multiple_of: int, 147 | ): 148 | super().__init__() 149 | hidden_dim = int(2 * hidden_dim / 3) 150 | hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) 151 | 152 | self.w1 = nn.Linear(dim, hidden_dim, bias=False) 153 | self.w2 = nn.Linear(hidden_dim, dim, bias=False) 154 | self.w3 = nn.Linear(dim, hidden_dim, bias=False) 155 | 156 | def forward(self, x): 157 | return self.w2(F.silu(self.w1(x)) * self.w3(x)) 158 | 159 | 160 | class TransformerBlock(nn.Module): 161 | def __init__(self, layer_id: int, args: ModelArgs): 162 | super().__init__() 163 | self.n_heads = args.n_heads 164 | self.dim = args.dim 165 | self.head_dim = args.dim // args.n_heads 166 | self.attention = Attention(args) 167 | self.feed_forward = FeedForward( 168 | dim=args.dim, hidden_dim=4 * args.dim, multiple_of=args.multiple_of 169 | ) 170 | self.layer_id = layer_id 171 | self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps) 172 | self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps) 173 | 174 | def forward( 175 | self, 176 | x: torch.Tensor, 177 | start_pos: int, 178 | freqs_cis: torch.Tensor, 179 | mask: Optional[torch.Tensor], 180 | ): 181 | h = x + 
self.attention.forward( 182 | self.attention_norm(x), start_pos, freqs_cis, mask 183 | ) 184 | out = h + self.feed_forward.forward(self.ffn_norm(h)) 185 | return out 186 | 187 | 188 | class Transformer(nn.Module): 189 | def __init__(self, params: ModelArgs): 190 | super().__init__() 191 | self.params = params 192 | self.vocab_size = params.vocab_size 193 | self.n_layers = params.n_layers 194 | 195 | self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim) 196 | 197 | self.layers = torch.nn.ModuleList() 198 | for layer_id in range(params.n_layers): 199 | self.layers.append(TransformerBlock(layer_id, params)) 200 | 201 | self.norm = RMSNorm(params.dim, eps=params.norm_eps) 202 | self.output = nn.Linear(params.dim, params.vocab_size, bias=False) 203 | 204 | self.freqs_cis = precompute_freqs_cis( 205 | self.params.dim // self.params.n_heads, self.params.max_seq_len * 2 206 | ) 207 | 208 | @torch.inference_mode() 209 | def forward(self, tokens: torch.Tensor, start_pos: int): 210 | _bsz, seqlen = tokens.shape 211 | h = self.tok_embeddings(tokens) 212 | self.freqs_cis = self.freqs_cis.to(h.device) 213 | freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen] 214 | 215 | mask = None 216 | if seqlen > 1: 217 | mask = torch.full( 218 | (1, 1, seqlen, seqlen), float("-inf"), device=tokens.device 219 | ) 220 | mask = torch.triu(mask, diagonal=start_pos + 1).type_as(h) 221 | 222 | for layer in self.layers: 223 | h = layer(h, start_pos, freqs_cis, mask) 224 | h = self.norm(h) 225 | output = self.output(h[:, -1, :]) # only compute last logits 226 | return output.float() 227 | -------------------------------------------------------------------------------- /llama/tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the GNU General Public License version 3. 
3 | 4 | from sentencepiece import SentencePieceProcessor 5 | from logging import getLogger 6 | from typing import * 7 | import os 8 | 9 | from transformers.tokenization_utils import PreTrainedTokenizer 10 | 11 | logger = getLogger() 12 | 13 | 14 | class Tokenizer: 15 | def __init__(self, model_path: str): 16 | # reload tokenizer 17 | assert os.path.isfile(model_path), model_path 18 | self.sp_model = SentencePieceProcessor(model_file=model_path) 19 | #print(f"loaded SentencePiece model from {model_path}") 20 | 21 | # BOS / EOS token IDs 22 | self.n_words: int = self.sp_model.vocab_size() 23 | self.bos_id: int = self.sp_model.bos_id() 24 | self.eos_id: int = self.sp_model.eos_id() 25 | self.pad_id: int = self.sp_model.pad_id() 26 | #print(f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}") 27 | assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() 28 | 29 | def encode(self, s: str, bos: bool, eos: bool) -> List[int]: 30 | assert type(s) is str 31 | t = self.sp_model.encode(s) 32 | if bos: 33 | t = [self.bos_id] + t 34 | if eos: 35 | t = t + [self.eos_id] 36 | return t 37 | 38 | def decode(self, t: List[int]) -> str: 39 | return self.sp_model.decode(t) 40 | 41 | 42 | if __name__ == "__main__": 43 | def get_args(): 44 | import argparse 45 | 46 | parser = argparse.ArgumentParser() 47 | parser.add_argument( 48 | "--tokenizer_path", type=str, default="/llama_data/tokenizer.model" 49 | ) 50 | return parser.parse_args() 51 | 52 | t = Tokenizer(model_path=get_args().tokenizer_path) 53 | print(t.encode("hello world", False, False)) 54 | -------------------------------------------------------------------------------- /llama/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.8" 2 | -------------------------------------------------------------------------------- /quant_infer.py: -------------------------------------------------------------------------------- 1 | import hiq, time 2 | from hiq.memory import total_gpu_memory_mb, get_memory_mb 3 | import platform 4 | 5 | 6 | def main(): 7 | 8 | try: 9 | wsl = 'microsoft' in platform.uname()[2].lower() 10 | except: 11 | wsl = False 12 | 13 | driver = hiq.HiQLatency( 14 | hiq_table_or_path=[ 15 | ["llama.llama_infer", "", "run", "run_quant"], 16 | ["llama.llama_infer", "LLaMATokenizer", "from_pretrained", "from_pretrained"], 17 | ["llama.hf", "LLaMATokenizer", "encode", "encode"], 18 | ["llama.llama_infer", "", "load_quant", "load_quant"], 19 | ["llama.hf.modeling_llama", "LLaMAForCausalLM", "generate", "generate"] 20 | ], 21 | metric_funcs=[time.time, get_memory_mb] + ([total_gpu_memory_mb] if not wsl else []), # WSL does not contain nvidia-smi 22 | # extra_metrics={hiq.ExtraMetrics.ARGS}, 23 | ) 24 | 25 | args = hiq.mod("llama.llama_infer").get_args() 26 | hiq.mod("llama.llama_infer").run(args) 27 | print("*" * 30, ("GPU/" if not wsl else "") + "CPU/Latency Profiling", "*" * 30) 28 | if wsl: 29 | print('(WSL does not contain nvidia-smi, GPU profiling is disabled)') 30 | driver.show() 31 | 32 | 33 | if __name__ == "__main__": 34 | main() 35 | -------------------------------------------------------------------------------- /requirements-quant.txt: -------------------------------------------------------------------------------- 1 | transformers>=4.26.0 2 | gptq>=0.0.2 3 | sentencepiece>=0.1.97 4 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | torch>=1.12.0 2 | fairscale>=0.4.13 3 | fire~=0.5.0 4 | hiq-python>=1.1.9 5 | sentencepiece==0.1.97 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the GNU General Public License version 3. 3 | 4 | from setuptools import setup, find_packages 5 | import os 6 | 7 | here = os.path.dirname(os.path.realpath(__file__)) 8 | 9 | 10 | def _get_version(): 11 | with open(os.path.join(here, "llama", "version.py")) as f: 12 | try: 13 | version_line = next(line for line in f if line.startswith("__version__")) 14 | except StopIteration: 15 | raise ValueError("__version__ not defined in llama/version.py") 16 | else: 17 | ns = {} 18 | exec(version_line, ns) # pylint: disable=exec-used 19 | return ns["__version__"] 20 | 21 | 22 | VERSION = _get_version() 23 | 24 | 25 | def read_file(filename: str): 26 | try: 27 | lines = [] 28 | with open(filename) as file: 29 | lines = file.readlines() 30 | lines = [line.rstrip() for line in lines if not line.startswith("#")] 31 | return lines 32 | except OSError: 33 | return [] 34 | 35 | 36 | DESCRIPTION = "🦙 LLaMA: Open and Efficient Foundation Language Models on a Single GPU" 37 | 38 | r_quant = read_file(f"{here}/requirements-quant.txt") 39 | r_basic = read_file(f"{here}/requirements.txt") 40 | 41 | 42 | def package_files(ds): 43 | paths = [] 44 | for d in ds: 45 | for path, directories, filenames in os.walk(d): 46 | for filename in filenames: 47 | if "__pycache__" not in str(filename) and not filename.endswith('.pyc'): 48 | paths.append(str(os.path.join(path, filename))[len("llama/") :]) 49 | return paths 50 | 51 | 52 | extra_files = package_files(["llama/"]) 53 | 54 | setup( 55 | name="pyllama", 56 | version=VERSION, 57 | author="Juncong Moo;Meta AI", 58 | author_email="JuncongMoo@gmail.com", 59 | description=DESCRIPTION, 60 | long_description=open("README.md", "r", encoding="utf-8").read(), 61 | long_description_content_type="text/markdown", 62 | install_requires=r_basic, 63 | package_data={"llama": extra_files}, 64 | include_package_data=True, 65 | keywords=[ 66 | "LLaMA", 67 | ], 68 | classifiers=[ 69 | "Programming Language :: Python :: 3", 70 | "Programming Language :: Python :: 3.6", 71 | "Programming Language :: Python :: 3.7", 72 | "Programming Language :: Python :: 3.8", 73 | "Programming Language :: Python :: 3.9", 74 | "Programming Language :: Python :: 3.10", 75 | "Programming Language :: Python :: 3.11", 76 | ], 77 | url="https://github.com/juncongmoo/pyllama", 78 | packages=["llama"], 79 | extras_require={ 80 | "quant": r_quant, 81 | "full": r_quant + r_basic, 82 | }, 83 | ) 84 | --------------------------------------------------------------------------------
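
The rotary-embedding helpers in llama/model_single.py have a strict shape contract: precompute_freqs_cis(dim, end) returns a complex64 tensor of shape (end, dim // 2), and apply_rotary_emb expects the slice it receives to match (seqlen, head_dim // 2) via the assert in reshape_for_broadcast. The following is an illustrative, CPU-only check (not part of the repository) that exercises that contract with random tensors; all sizes are arbitrary example values.

import torch
from llama.model_single import precompute_freqs_cis, apply_rotary_emb

bsz, seqlen, n_heads, head_dim = 2, 16, 8, 64
xq = torch.randn(bsz, seqlen, n_heads, head_dim)
xk = torch.randn(bsz, seqlen, n_heads, head_dim)

# (2048, 32) complex64; slicing the first seqlen rows satisfies the shape
# assert inside reshape_for_broadcast.
freqs_cis = precompute_freqs_cis(head_dim, 2048)
xq_rot, xk_rot = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis[:seqlen])

assert xq_rot.shape == xq.shape and xk_rot.shape == xk.shape
# Multiplying by unit-modulus complex numbers rotates each (even, odd) feature
# pair without changing its magnitude, so per-head vector norms are preserved.
assert torch.allclose(xq.norm(dim=-1), xq_rot.norm(dim=-1), atol=1e-4)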
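
As a quick sanity check of the single-GPU model defined in llama/model_single.py, the sketch below builds a small Transformer with randomly initialized weights (its outputs are meaningless) purely to exercise the forward(tokens, start_pos) contract: the full prompt primes the KV cache at start_pos=0, then tokens are fed one at a time with an advancing start_pos. This is an illustrative example, not a repository entry point; it assumes a CUDA device (the cache is placed on the GPU by default via the KV_CAHCHE_IN_GPU flag read in Attention.__init__) and that the listed requirements are installed. The vocab_size of 32000 and the random prompt are stand-in values; real checkpoint and tokenizer loading is handled elsewhere, e.g. by llama.llama_infer.

import torch
from llama.model_single import ModelArgs, Transformer

assert torch.cuda.is_available(), "the KV cache defaults to GPU placement"

# Tiny model using the ModelArgs defaults (dim=512, n_layers=8, n_heads=8);
# only vocab_size has to be supplied because it defaults to -1.
args = ModelArgs(vocab_size=32000)
model = Transformer(args).cuda().eval()

# Stand-in prompt of 8 random token ids; max_batch_size defaults to 1.
prompt = torch.randint(0, args.vocab_size, (1, 8), device="cuda")

# Prime the cache with the whole prompt, then greedily pick the next token.
logits = model(prompt, start_pos=0)
next_token = torch.argmax(logits, dim=-1, keepdim=True)

# Incremental decoding: one token per step, advancing start_pos so the cached
# keys/values for earlier positions are reused instead of recomputed.
cur_pos = prompt.shape[1]
for _ in range(4):
    logits = model(next_token, start_pos=cur_pos)
    next_token = torch.argmax(logits, dim=-1, keepdim=True)
    cur_pos += 1
print(next_token)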