45 | Output
46 |
47 | ggml_init_cublas: GGML_CUDA_FORCE_MMQ: no
48 | ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
49 | ggml_init_cublas: found 1 CUDA devices:
50 | Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
51 | llama_model_params { n_gpu_layers: 1000, split_mode: 1, main_gpu: 0, tensor_split: 0x0, progress_callback: None, progress_callback_user_data: 0x0, kv_overrides: 0x0, vocab_only: false, use_mmap: true, use_mlock: false }
52 | llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /home/marcus/.cache/huggingface/hub/models--TheBloke--Llama-2-7B-GGUF/snapshots/b4e04e128f421c93a5f1e34ac4d7ca9b0af47b80/llama-2-7b.Q4_K_M.gguf (version GGUF V2)
53 | llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
54 | llama_model_loader: - kv 0: general.architecture str = llama
55 | llama_model_loader: - kv 1: general.name str = LLaMA v2
56 | llama_model_loader: - kv 2: llama.context_length u32 = 4096
57 | llama_model_loader: - kv 3: llama.embedding_length u32 = 4096
58 | llama_model_loader: - kv 4: llama.block_count u32 = 32
59 | llama_model_loader: - kv 5: llama.feed_forward_length u32 = 11008
60 | llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128
61 | llama_model_loader: - kv 7: llama.attention.head_count u32 = 32
62 | llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 32
63 | llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
64 | llama_model_loader: - kv 10: general.file_type u32 = 15
65 | llama_model_loader: - kv 11: tokenizer.ggml.model str = llama
66 | llama_model_loader: - kv 12: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
67 | llama_model_loader: - kv 13: tokenizer.ggml.scores arr[f32,32000] = [0.000000, 0.000000, 0.000000, 0.0000...
68 | llama_model_loader: - kv 14: tokenizer.ggml.token_type arr[i32,32000] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
69 | llama_model_loader: - kv 15: tokenizer.ggml.bos_token_id u32 = 1
70 | llama_model_loader: - kv 16: tokenizer.ggml.eos_token_id u32 = 2
71 | llama_model_loader: - kv 17: tokenizer.ggml.unknown_token_id u32 = 0
72 | llama_model_loader: - kv 18: general.quantization_version u32 = 2
73 | llama_model_loader: - type f32: 65 tensors
74 | llama_model_loader: - type q4_K: 193 tensors
75 | llama_model_loader: - type q6_K: 33 tensors
76 | llm_load_vocab: special tokens definition check successful ( 259/32000 ).
77 | llm_load_print_meta: format = GGUF V2
78 | llm_load_print_meta: arch = llama
79 | llm_load_print_meta: vocab type = SPM
80 | llm_load_print_meta: n_vocab = 32000
81 | llm_load_print_meta: n_merges = 0
82 | llm_load_print_meta: n_ctx_train = 4096
83 | llm_load_print_meta: n_embd = 4096
84 | llm_load_print_meta: n_head = 32
85 | llm_load_print_meta: n_head_kv = 32
86 | llm_load_print_meta: n_layer = 32
87 | llm_load_print_meta: n_rot = 128
88 | llm_load_print_meta: n_embd_head_k = 128
89 | llm_load_print_meta: n_embd_head_v = 128
90 | llm_load_print_meta: n_gqa = 1
91 | llm_load_print_meta: n_embd_k_gqa = 4096
92 | llm_load_print_meta: n_embd_v_gqa = 4096
93 | llm_load_print_meta: f_norm_eps = 0.0e+00
94 | llm_load_print_meta: f_norm_rms_eps = 1.0e-05
95 | llm_load_print_meta: f_clamp_kqv = 0.0e+00
96 | llm_load_print_meta: f_max_alibi_bias = 0.0e+00
97 | llm_load_print_meta: n_ff = 11008
98 | llm_load_print_meta: n_expert = 0
99 | llm_load_print_meta: n_expert_used = 0
100 | llm_load_print_meta: rope scaling = linear
101 | llm_load_print_meta: freq_base_train = 10000.0
102 | llm_load_print_meta: freq_scale_train = 1
103 | llm_load_print_meta: n_yarn_orig_ctx = 4096
104 | llm_load_print_meta: rope_finetuned = unknown
105 | llm_load_print_meta: model type = 7B
106 | llm_load_print_meta: model ftype = Q4_K - Medium
107 | llm_load_print_meta: model params = 6.74 B
108 | llm_load_print_meta: model size = 3.80 GiB (4.84 BPW)
109 | llm_load_print_meta: general.name = LLaMA v2
110 | llm_load_print_meta: BOS token = 1 '<s>'
111 | llm_load_print_meta: EOS token = 2 '</s>'
112 | llm_load_print_meta: UNK token = 0 '<unk>'
113 | llm_load_print_meta: LF token = 13 '<0x0A>'
114 | llm_load_tensors: ggml ctx size = 0.22 MiB
115 | llm_load_tensors: offloading 32 repeating layers to GPU
116 | llm_load_tensors: offloading non-repeating layers to GPU
117 | llm_load_tensors: offloaded 33/33 layers to GPU
118 | llm_load_tensors: CUDA0 buffer size = 3820.94 MiB
119 | llm_load_tensors: CPU buffer size = 70.31 MiB
120 | ..................................................................................................
121 | Loaded "/home/marcus/.cache/huggingface/hub/models--TheBloke--Llama-2-7B-GGUF/snapshots/b4e04e128f421c93a5f1e34ac4d7ca9b0af47b80/llama-2-7b.Q4_K_M.gguf"
122 | llama_new_context_with_model: n_ctx = 2048
123 | llama_new_context_with_model: freq_base = 10000.0
124 | llama_new_context_with_model: freq_scale = 1
125 | llama_kv_cache_init: CUDA0 KV buffer size = 1024.00 MiB
126 | llama_new_context_with_model: KV self size = 1024.00 MiB, K (f16): 512.00 MiB, V (f16): 512.00 MiB
127 | llama_new_context_with_model: CUDA_Host input buffer size = 13.02 MiB
128 | ggml_gallocr_reserve_n: reallocating CUDA0 buffer from size 0.00 MiB to 164.01 MiB
129 | ggml_gallocr_reserve_n: reallocating CUDA_Host buffer from size 0.00 MiB to 8.00 MiB
130 | llama_new_context_with_model: CUDA0 compute buffer size = 164.01 MiB
131 | llama_new_context_with_model: CUDA_Host compute buffer size = 8.00 MiB
132 | llama_new_context_with_model: graph splits (measure): 3
133 | n_len = 32, n_ctx = 2048, k_kv_req = 32
134 |
135 | The way to kill a linux process is to send it a SIGKILL signal.
136 | The way to kill a windows process is to send it a S
137 |
138 | decoded 24 tokens in 0.23 s, speed 105.65 t/s
139 |
140 | load time = 727.50 ms
141 | sample time = 0.46 ms / 24 runs (0.02 ms per token, 51835.85 tokens per second)
142 | prompt eval time = 68.52 ms / 9 tokens (7.61 ms per token, 131.35 tokens per second)
143 | eval time = 225.70 ms / 24 runs (9.40 ms per token, 106.34 tokens per second)
144 | total time = 954.18 ms
145 |
146 |
147 |
148 | ## Hacking
149 |
150 | Ensure that when you clone this project you also clone the submodules. This can be done with the following command:
151 |
152 | ```sh
153 | git clone --recursive https://github.com/utilityai/llama-cpp-rs
154 | ```
155 |
156 | or, if you have already cloned the project, you can run:
157 |
158 | ```sh
159 | git submodule update --init --recursive
160 | ```
161 |
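Once the submodules are in place, the workspace should build with plain `cargo`. The commands below are a sketch rather than canonical instructions: they assume the C/C++ toolchain that llama.cpp itself needs is installed, and they use the `embeddings` example crate and its `cuda` feature purely as an illustration of the backend features (`cuda`, `metal`, `native`, `vulkan`) declared in the example Cargo.toml files such as the one below.

```sh
# CPU-only build of the whole workspace.
cargo build --release

# Illustrative only: build a single example crate with a GPU backend feature
# (here the `embeddings` example with `cuda`; `metal` or `vulkan` are enabled the same way).
cargo build --release -p embeddings --features cuda
```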
--------------------------------------------------------------------------------
/examples/embeddings/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "embeddings"
3 | version = "0.1.109"
4 | edition = "2021"
5 |
6 | [dependencies]
7 | llama-cpp-2 = { path = "../../llama-cpp-2", version = "0.1.69" }
8 | hf-hub = { workspace = true }
9 | clap = { workspace = true, features = ["derive"] }
10 | anyhow = { workspace = true }
11 |
12 | [features]
13 | cuda = ["llama-cpp-2/cuda"]
14 | metal = ["llama-cpp-2/metal"]
15 | native = ["llama-cpp-2/native"]
16 | vulkan = ["llama-cpp-2/vulkan"]
17 |
18 | [lints]
19 | workspace = true
20 |
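As a usage sketch (not canonical documentation), this example can be run through cargo using the subcommands defined in `src/main.rs` below; the repo and file names are taken from the doc comments there, and the exact flags may differ on your setup:

```sh
# Illustrative only: embed the default prompt with a model downloaded from
# huggingface (or reused from the local cache).
cargo run --release -p embeddings -- hf-model BAAI/bge-small-en-v1.5 BAAI-bge-small-v1.5.Q4_K_M.gguf

# Illustrative only: use an already downloaded GGUF file instead.
cargo run --release -p embeddings -- local /path/to/model.gguf
```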
--------------------------------------------------------------------------------
/examples/embeddings/src/main.rs:
--------------------------------------------------------------------------------
1 | //! This is a translation of embedding.cpp in llama.cpp using llama-cpp-2.
2 | #![allow(
3 | clippy::cast_possible_wrap,
4 | clippy::cast_possible_truncation,
5 | clippy::cast_precision_loss,
6 | clippy::cast_sign_loss
7 | )]
8 |
9 | use std::io::Write;
10 | use std::path::PathBuf;
11 | use std::time::Duration;
12 |
13 | use anyhow::{bail, Context, Result};
14 | use clap::Parser;
15 | use hf_hub::api::sync::ApiBuilder;
16 |
17 | use llama_cpp_2::context::params::LlamaContextParams;
18 | use llama_cpp_2::context::LlamaContext;
19 | use llama_cpp_2::ggml_time_us;
20 | use llama_cpp_2::llama_backend::LlamaBackend;
21 | use llama_cpp_2::llama_batch::LlamaBatch;
22 | use llama_cpp_2::model::params::LlamaModelParams;
23 | use llama_cpp_2::model::LlamaModel;
24 | use llama_cpp_2::model::{AddBos, Special};
25 |
26 | #[derive(clap::Parser, Debug, Clone)]
27 | struct Args {
28 | /// The model to use (a local path or a model downloaded from huggingface)
29 | #[command(subcommand)]
30 | model: Model,
31 | /// The prompt
32 | #[clap(default_value = "Hello my name is")]
33 | prompt: String,
34 | /// Whether to normalise the produced embeddings
35 | #[clap(short)]
36 | normalise: bool,
37 | /// Disable offloading layers to the gpu
38 | #[cfg(any(feature = "cuda", feature = "vulkan"))]
39 | #[clap(long)]
40 | disable_gpu: bool,
41 | }
42 |
43 | #[derive(clap::Subcommand, Debug, Clone)]
44 | enum Model {
45 | /// Use an already downloaded model
46 | Local {
47 | /// The path to the model. e.g. `/home/marcus/.cache/huggingface/hub/models--TheBloke--Llama-2-7B-Chat-GGUF/blobs/08a5566d61d7cb6b420c3e4387a39e0078e1f2fe5f055f3a03887385304d4bfa`
48 | path: PathBuf,
49 | },
50 | /// Download a model from huggingface (or use a cached version)
51 | #[clap(name = "hf-model")]
52 | HuggingFace {
53 | /// the repo containing the model. e.g. `BAAI/bge-small-en-v1.5`
54 | repo: String,
55 | /// the model name. e.g. `BAAI-bge-small-v1.5.Q4_K_M.gguf`
56 | model: String,
57 | },
58 | }
59 |
60 | impl Model {
61 | /// Convert the model to a path - may download from huggingface
62 | fn get_or_load(self) -> Result