From f8c1f36be8116b1213e0e77df7fa9403ba3acd59 Mon Sep 17 00:00:00 2001 From: Tai An Date: Sun, 10 May 2026 22:53:57 -0700 Subject: [PATCH 1/7] fix(embed): mark all tokens as output to suppress llama.cpp 'overriding' INFO (#2208) (#2212) --- CHANGELOG.md | 1 + llama_cpp/llama.py | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5031e5808..808a3647d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - feat: Update llama.cpp to ggerganov/llama.cpp@5d6f18a63 and sync Python bindings - fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205 - fix: Configure embedding contexts with enough sequence slots for batched `embed()` calls +- fix: Mark all embedding input tokens as outputs to avoid llama.cpp override warnings by @Anai-Guo in #2212 ## [0.3.22] diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 752c25dd3..2afa4c8e9 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1040,7 +1040,13 @@ def embed( # get pooling information pooling_type = self.pooling_type() - logits_all = pooling_type == llama_cpp.LLAMA_POOLING_TYPE_NONE + # In embedding mode every input token must be marked as an output, regardless of + # pooling type. llama.cpp would otherwise override per-token `logits[i]` and emit + # "embeddings required but some input tokens were not marked as outputs -> + # overriding" once per input. Pooling NONE vs MEAN/CLS only changes how the + # per-token outputs are read back (see decode_batch below), not whether they are + # produced. See abetlen/llama-cpp-python#2208. + logits_all = True if self.context_params.embeddings is False: raise RuntimeError( From 568411233f5f326f80c41c6e026bc80f27c00e69 Mon Sep 17 00:00:00 2001 From: Andrei Date: Sun, 10 May 2026 23:27:25 -0700 Subject: [PATCH 2/7] feat: update llama.cpp to 7d442abf (#2214) --- CHANGELOG.md | 2 +- vendor/llama.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 808a3647d..a783fab42 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -- feat: Update llama.cpp to ggerganov/llama.cpp@5d6f18a63 and sync Python bindings +- feat: Update llama.cpp to ggerganov/llama.cpp@7d442abf - fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205 - fix: Configure embedding contexts with enough sequence slots for batched `embed()` calls - fix: Mark all embedding input tokens as outputs to avoid llama.cpp override warnings by @Anai-Guo in #2212 diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 5d6f18a63..7d442abf5 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 5d6f18a6387a7066fe387233f2ca6f113cb209fb +Subproject commit 7d442abf5c6244117fd5a1dc51a5d19f00792491 From 4a1a8ecd8047149b24a6d997f6f8c992d49aa99a Mon Sep 17 00:00:00 2001 From: Andrei Date: Mon, 11 May 2026 03:07:09 -0700 Subject: [PATCH 3/7] chore: bump version to 0.3.23 (#2215) --- CHANGELOG.md | 2 ++ llama_cpp/__init__.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a783fab42..645fd8005 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.23] + - feat: Update llama.cpp to ggerganov/llama.cpp@7d442abf - fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205 - fix: Configure embedding contexts with enough sequence slots for batched `embed()` calls diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 78292de30..eb37da209 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.22" +__version__ = "0.3.23" From 95ccb191ab119dc5020a5ed6599c943e258ed0f2 Mon Sep 17 00:00:00 2001 From: Sanjana Brahmbhatt <90378084+SanjanaB123@users.noreply.github.com> Date: Wed, 13 May 2026 16:35:30 -0400 Subject: [PATCH 4/7] fix(embedding): set kv_unified=True when embedding=True to enable batch processing (#2217) * fix(embedding): set kv_unified=True when embedding=True to enable batch processing * chore: update changelog for batch embedding fix --------- Co-authored-by: abetlen --- CHANGELOG.md | 2 ++ llama_cpp/llama.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 645fd8005..900176ea1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217 + ## [0.3.23] - feat: Update llama.cpp to ggerganov/llama.cpp@7d442abf diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 2afa4c8e9..75c74b41f 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -402,7 +402,7 @@ def __init__( self.n_batch, llama_cpp.llama_max_parallel_sequences(), ) - + self.context_params.kv_unified = True self._ctx = self._stack.enter_context( contextlib.closing( internals.LlamaContext( From 7664a3edc520ca0988db77f781984100070b050f Mon Sep 17 00:00:00 2001 From: Andrei Date: Fri, 15 May 2026 02:20:05 -0700 Subject: [PATCH 5/7] feat: Update llama.cpp to ggerganov/llama.cpp@91e84fed6 (#2218) * feat: update llama.cpp to 91e84fed6 * chore: document mtmd_caps c declaration --- CHANGELOG.md | 1 + llama_cpp/llama_cpp.py | 3 +++ llama_cpp/mtmd_cpp.py | 24 ++++++++++++++++++++++++ vendor/llama.cpp | 2 +- 4 files changed, 29 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 900176ea1..a0b63061c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- feat: Update llama.cpp to ggerganov/llama.cpp@91e84fed6 and sync Python bindings - fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217 ## [0.3.23] diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index a5ec5d190..a9c32a15b 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -2837,6 +2837,9 @@ def llama_state_seq_load_file( ) -> int: ... +# define LLAMA_STATE_SEQ_FLAGS_NONE 0 +LLAMA_STATE_SEQ_FLAGS_NONE = 0 + # for backwards-compat # define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1 LLAMA_STATE_SEQ_FLAGS_SWA_ONLY = 1 diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 485dc5d8c..f2b0ed2de 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -134,6 +134,23 @@ class mtmd_decoder_pos(Structure): ] +# struct mtmd_caps { +# bool inp_vision; +# bool inp_audio; +# }; +class mtmd_caps(Structure): + """Capabilities exposed by an mmproj file.""" + + if TYPE_CHECKING: + inp_vision: bool + inp_audio: bool + + _fields_ = [ + ("inp_vision", c_bool), + ("inp_audio", c_bool), + ] + + ################################################ # mtmd.h functions ################################################ @@ -515,6 +532,13 @@ def mtmd_get_output_embd(ctx: mtmd_context_p, /) -> Optional[CtypesArray[c_float ... +# MTMD_API struct mtmd_caps mtmd_get_cap_from_file(const char * mmproj_fname); +@ctypes_function("mtmd_get_cap_from_file", [c_char_p], mtmd_caps) +def mtmd_get_cap_from_file(mmproj_fname: bytes, /) -> mtmd_caps: + """Get mmproj capabilities without initializing a full MTMD context.""" + ... + + # MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void); @ctypes_function("mtmd_test_create_input_chunks", [], mtmd_input_chunks_p_ctypes) def mtmd_test_create_input_chunks() -> Optional[mtmd_input_chunks_p]: diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 7d442abf5..91e84fed6 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 7d442abf5c6244117fd5a1dc51a5d19f00792491 +Subproject commit 91e84fed64329cd96202d68220724a1d92f5ec1f From c7bea7110b4371d51b1385afd7acb4c1842b2d49 Mon Sep 17 00:00:00 2001 From: shalinib-ibm Date: Fri, 15 May 2026 16:47:13 +0530 Subject: [PATCH 6/7] chore: migrate llama.cpp submodule to ggml-org (#2034) Co-authored-by: abetlen --- .gitmodules | 2 +- CHANGELOG.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 7edf0975d..f56cca32d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "vendor/llama.cpp"] path = vendor/llama.cpp - url = https://github.com/ggerganov/llama.cpp.git + url = https://github.com/ggml-org/llama.cpp.git diff --git a/CHANGELOG.md b/CHANGELOG.md index a0b63061c..36e4fa168 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] - feat: Update llama.cpp to ggerganov/llama.cpp@91e84fed6 and sync Python bindings +- chore: Migrate llama.cpp submodule URL to ggml-org/llama.cpp by @shalinib-ibm in #2034 - fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217 ## [0.3.23] From 5dd9b1ce2ceefe61779f92c1be539dd2df77c77c Mon Sep 17 00:00:00 2001 From: Andrei Date: Mon, 18 May 2026 07:55:25 -0700 Subject: [PATCH 7/7] feat: Update llama.cpp to b9a2170fc (#2223) --- CHANGELOG.md | 2 +- llama_cpp/llama_cpp.py | 21 +++++++++++++++++++++ vendor/llama.cpp | 2 +- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 36e4fa168..18c6af161 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -- feat: Update llama.cpp to ggerganov/llama.cpp@91e84fed6 and sync Python bindings +- feat: Update llama.cpp to ggml-org/llama.cpp@b9a2170fc and sync Python bindings - chore: Migrate llama.cpp submodule URL to ggml-org/llama.cpp by @shalinib-ibm in #2034 - fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217 diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index a9c32a15b..6560b5178 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -516,6 +516,14 @@ def _warn_deprecated(symbol: str, hint: str) -> None: LLAMA_SPLIT_MODE_TENSOR = 3 +# enum llama_context_type { +# LLAMA_CONTEXT_TYPE_DEFAULT = 0, +# LLAMA_CONTEXT_TYPE_MTP = 1, +# }; +LLAMA_CONTEXT_TYPE_DEFAULT = 0 +LLAMA_CONTEXT_TYPE_MTP = 1 + + # typedef struct llama_token_data { # llama_token id; // token id # float logit; // log-odds of the token @@ -894,9 +902,11 @@ class llama_sampler_seq_config(ctypes.Structure): # uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode # uint32_t n_ubatch; // physical maximum batch size # uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models) +# uint32_t n_rs_seq; // number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL] # int32_t n_threads; // number of threads to use for generation # int32_t n_threads_batch; // number of threads to use for batch processing +# enum llama_context_type ctx_type; // set the context type (e.g. MTP) # enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` # enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id # enum llama_attention_type attention_type; // attention type to use for embeddings @@ -947,8 +957,10 @@ class llama_context_params(ctypes.Structure): n_batch (int): logical maximum batch size that can be submitted to llama_decode n_ubatch (int): physical maximum batch size n_seq_max (int): max number of sequences (i.e. distinct states for recurrent models) + n_rs_seq (int): number of recurrent-state snapshots per sequence for rollback n_threads (int): number of threads to use for generation n_threads_batch (int): number of threads to use for batch processing + ctx_type (int): context type, from `enum llama_context_type` rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type` pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) attention_type (int): attention type to use for embeddings @@ -982,8 +994,10 @@ class llama_context_params(ctypes.Structure): n_batch: int n_ubatch: int n_seq_max: int + n_rs_seq: int n_threads: int n_threads_batch: int + ctx_type: int rope_scaling_type: int pooling_type: int attention_type: int @@ -1016,8 +1030,10 @@ class llama_context_params(ctypes.Structure): ("n_batch", ctypes.c_uint32), ("n_ubatch", ctypes.c_uint32), ("n_seq_max", ctypes.c_uint32), + ("n_rs_seq", ctypes.c_uint32), ("n_threads", ctypes.c_int32), ("n_threads_batch", ctypes.c_int32), + ("ctx_type", ctypes.c_int), ("rope_scaling_type", ctypes.c_int), ("pooling_type", ctypes.c_int), ("attention_type", ctypes.c_int), @@ -1591,6 +1607,11 @@ def llama_n_ubatch(ctx: llama_context_p, /) -> int: ... def llama_n_seq_max(ctx: llama_context_p, /) -> int: ... +# LLAMA_API uint32_t llama_n_rs_seq (const struct llama_context * ctx); +@ctypes_function("llama_n_rs_seq", [llama_context_p_ctypes], ctypes.c_uint32) +def llama_n_rs_seq(ctx: llama_context_p, /) -> int: ... + + # DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead"); @ctypes_function("llama_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32) def llama_n_ctx_train(model: llama_model_p, /) -> int: ... diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 91e84fed6..b9a2170fc 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 91e84fed64329cd96202d68220724a1d92f5ec1f +Subproject commit b9a2170fce1f3f33cb4934b34efecb806bbbb348