From f8c1f36be8116b1213e0e77df7fa9403ba3acd59 Mon Sep 17 00:00:00 2001
From: Tai An <antai12232931@outlook.com>
Date: Sun, 10 May 2026 22:53:57 -0700
Subject: [PATCH 1/7] fix(embed): mark all tokens as output to suppress
 llama.cpp 'overriding' INFO (#2208) (#2212)

---
 CHANGELOG.md       | 1 +
 llama_cpp/llama.py | 8 +++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5031e5808..808a3647d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - feat: Update llama.cpp to ggerganov/llama.cpp@5d6f18a63 and sync Python bindings
 - fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205
 - fix: Configure embedding contexts with enough sequence slots for batched `embed()` calls
+- fix: Mark all embedding input tokens as outputs to avoid llama.cpp override warnings by @Anai-Guo in #2212
 
 ## [0.3.22]
 
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 752c25dd3..2afa4c8e9 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1040,7 +1040,13 @@ def embed(
 
         # get pooling information
         pooling_type = self.pooling_type()
-        logits_all = pooling_type == llama_cpp.LLAMA_POOLING_TYPE_NONE
+        # In embedding mode every input token must be marked as an output, regardless of
+        # pooling type. llama.cpp would otherwise override per-token `logits[i]` and emit
+        # "embeddings required but some input tokens were not marked as outputs ->
+        # overriding" once per input. Pooling NONE vs MEAN/CLS only changes how the
+        # per-token outputs are read back (see decode_batch below), not whether they are
+        # produced. See abetlen/llama-cpp-python#2208.
+        logits_all = True
 
         if self.context_params.embeddings is False:
             raise RuntimeError(

From 568411233f5f326f80c41c6e026bc80f27c00e69 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sun, 10 May 2026 23:27:25 -0700
Subject: [PATCH 2/7] feat: update llama.cpp to 7d442abf (#2214)

---
 CHANGELOG.md     | 2 +-
 vendor/llama.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 808a3647d..a783fab42 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-- feat: Update llama.cpp to ggerganov/llama.cpp@5d6f18a63 and sync Python bindings
+- feat: Update llama.cpp to ggerganov/llama.cpp@7d442abf
 - fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205
 - fix: Configure embedding contexts with enough sequence slots for batched `embed()` calls
 - fix: Mark all embedding input tokens as outputs to avoid llama.cpp override warnings by @Anai-Guo in #2212
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 5d6f18a63..7d442abf5 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 5d6f18a6387a7066fe387233f2ca6f113cb209fb
+Subproject commit 7d442abf5c6244117fd5a1dc51a5d19f00792491

From 4a1a8ecd8047149b24a6d997f6f8c992d49aa99a Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Mon, 11 May 2026 03:07:09 -0700
Subject: [PATCH 3/7] chore: bump version to 0.3.23 (#2215)

---
 CHANGELOG.md          | 2 ++
 llama_cpp/__init__.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a783fab42..645fd8005 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.23]
+
 - feat: Update llama.cpp to ggerganov/llama.cpp@7d442abf
 - fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205
 - fix: Configure embedding contexts with enough sequence slots for batched `embed()` calls
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index 78292de30..eb37da209 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.22"
+__version__ = "0.3.23"

From 95ccb191ab119dc5020a5ed6599c943e258ed0f2 Mon Sep 17 00:00:00 2001
From: Sanjana Brahmbhatt <90378084+SanjanaB123@users.noreply.github.com>
Date: Wed, 13 May 2026 16:35:30 -0400
Subject: [PATCH 4/7] fix(embedding): set kv_unified=True when embedding=True
 to enable batch processing (#2217)

* fix(embedding): set kv_unified=True when embedding=True to enable batch processing

* chore: update changelog for batch embedding fix

---------

Co-authored-by: abetlen <abetlen@gmail.com>
---
 CHANGELOG.md       | 2 ++
 llama_cpp/llama.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 645fd8005..900176ea1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217
+
 ## [0.3.23]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@7d442abf
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 2afa4c8e9..75c74b41f 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -402,7 +402,7 @@ def __init__(
                 self.n_batch,
                 llama_cpp.llama_max_parallel_sequences(),
             )
-
+            self.context_params.kv_unified = True
         self._ctx = self._stack.enter_context(
             contextlib.closing(
                 internals.LlamaContext(

From 7664a3edc520ca0988db77f781984100070b050f Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Fri, 15 May 2026 02:20:05 -0700
Subject: [PATCH 5/7] feat: Update llama.cpp to ggerganov/llama.cpp@91e84fed6
 (#2218)

* feat: update llama.cpp to 91e84fed6

* chore: document mtmd_caps c declaration
---
 CHANGELOG.md           |  1 +
 llama_cpp/llama_cpp.py |  3 +++
 llama_cpp/mtmd_cpp.py  | 24 ++++++++++++++++++++++++
 vendor/llama.cpp       |  2 +-
 4 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 900176ea1..a0b63061c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- feat: Update llama.cpp to ggerganov/llama.cpp@91e84fed6 and sync Python bindings
 - fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217
 
 ## [0.3.23]
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index a5ec5d190..a9c32a15b 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -2837,6 +2837,9 @@ def llama_state_seq_load_file(
 ) -> int: ...
 
 
+# define LLAMA_STATE_SEQ_FLAGS_NONE 0
+LLAMA_STATE_SEQ_FLAGS_NONE = 0
+
 # for backwards-compat
 # define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
 LLAMA_STATE_SEQ_FLAGS_SWA_ONLY = 1
diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py
index 485dc5d8c..f2b0ed2de 100644
--- a/llama_cpp/mtmd_cpp.py
+++ b/llama_cpp/mtmd_cpp.py
@@ -134,6 +134,23 @@ class mtmd_decoder_pos(Structure):
     ]
 
 
+# struct mtmd_caps {
+#     bool inp_vision;
+#     bool inp_audio;
+# };
+class mtmd_caps(Structure):
+    """Capabilities exposed by an mmproj file."""
+
+    if TYPE_CHECKING:
+        inp_vision: bool
+        inp_audio: bool
+
+    _fields_ = [
+        ("inp_vision", c_bool),
+        ("inp_audio", c_bool),
+    ]
+
+
 ################################################
 # mtmd.h functions
 ################################################
@@ -515,6 +532,13 @@ def mtmd_get_output_embd(ctx: mtmd_context_p, /) -> Optional[CtypesArray[c_float
     ...
 
 
+# MTMD_API struct mtmd_caps mtmd_get_cap_from_file(const char * mmproj_fname);
+@ctypes_function("mtmd_get_cap_from_file", [c_char_p], mtmd_caps)
+def mtmd_get_cap_from_file(mmproj_fname: bytes, /) -> mtmd_caps:
+    """Get mmproj capabilities without initializing a full MTMD context."""
+    ...
+
+
 # MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
 @ctypes_function("mtmd_test_create_input_chunks", [], mtmd_input_chunks_p_ctypes)
 def mtmd_test_create_input_chunks() -> Optional[mtmd_input_chunks_p]:
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 7d442abf5..91e84fed6 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 7d442abf5c6244117fd5a1dc51a5d19f00792491
+Subproject commit 91e84fed64329cd96202d68220724a1d92f5ec1f

From c7bea7110b4371d51b1385afd7acb4c1842b2d49 Mon Sep 17 00:00:00 2001
From: shalinib-ibm <Shalini.Salomi.Bodapati@ibm.com>
Date: Fri, 15 May 2026 16:47:13 +0530
Subject: [PATCH 6/7] chore: migrate llama.cpp submodule to ggml-org (#2034)

Co-authored-by: abetlen <abetlen@gmail.com>
---
 .gitmodules  | 2 +-
 CHANGELOG.md | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitmodules b/.gitmodules
index 7edf0975d..f56cca32d 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
 [submodule "vendor/llama.cpp"]
 	path = vendor/llama.cpp
-	url = https://github.com/ggerganov/llama.cpp.git
+	url = https://github.com/ggml-org/llama.cpp.git
diff --git a/CHANGELOG.md b/CHANGELOG.md
index a0b63061c..36e4fa168 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@91e84fed6 and sync Python bindings
+- chore: Migrate llama.cpp submodule URL to ggml-org/llama.cpp by @shalinib-ibm in #2034
 - fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217
 
 ## [0.3.23]

From 5dd9b1ce2ceefe61779f92c1be539dd2df77c77c Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Mon, 18 May 2026 07:55:25 -0700
Subject: [PATCH 7/7] feat: Update llama.cpp to b9a2170fc (#2223)

---
 CHANGELOG.md           |  2 +-
 llama_cpp/llama_cpp.py | 21 +++++++++++++++++++++
 vendor/llama.cpp       |  2 +-
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 36e4fa168..18c6af161 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-- feat: Update llama.cpp to ggerganov/llama.cpp@91e84fed6 and sync Python bindings
+- feat: Update llama.cpp to ggml-org/llama.cpp@b9a2170fc and sync Python bindings
 - chore: Migrate llama.cpp submodule URL to ggml-org/llama.cpp by @shalinib-ibm in #2034
 - fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217
 
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index a9c32a15b..6560b5178 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -516,6 +516,14 @@ def _warn_deprecated(symbol: str, hint: str) -> None:
 LLAMA_SPLIT_MODE_TENSOR = 3
 
 
+# enum llama_context_type {
+#     LLAMA_CONTEXT_TYPE_DEFAULT = 0,
+#     LLAMA_CONTEXT_TYPE_MTP     = 1,
+# };
+LLAMA_CONTEXT_TYPE_DEFAULT = 0
+LLAMA_CONTEXT_TYPE_MTP = 1
+
+
 # typedef struct llama_token_data {
 #     llama_token id; // token id
 #     float logit;    // log-odds of the token
@@ -894,9 +902,11 @@ class llama_sampler_seq_config(ctypes.Structure):
 #     uint32_t n_batch;           // logical maximum batch size that can be submitted to llama_decode
 #     uint32_t n_ubatch;          // physical maximum batch size
 #     uint32_t n_seq_max;         // max number of sequences (i.e. distinct states for recurrent models)
+#     uint32_t n_rs_seq;          // number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL]
 #     int32_t  n_threads;         // number of threads to use for generation
 #     int32_t  n_threads_batch;   // number of threads to use for batch processing
 
+#     enum llama_context_type      ctx_type;          // set the context type (e.g. MTP)
 #     enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 #     enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
 #     enum llama_attention_type    attention_type;    // attention type to use for embeddings
@@ -947,8 +957,10 @@ class llama_context_params(ctypes.Structure):
         n_batch (int): logical maximum batch size that can be submitted to llama_decode
         n_ubatch (int): physical maximum batch size
         n_seq_max (int): max number of sequences (i.e. distinct states for recurrent models)
+        n_rs_seq (int): number of recurrent-state snapshots per sequence for rollback
         n_threads (int): number of threads to use for generation
         n_threads_batch (int): number of threads to use for batch processing
+        ctx_type (int): context type, from `enum llama_context_type`
         rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type`
         pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
         attention_type (int): attention type to use for embeddings
@@ -982,8 +994,10 @@ class llama_context_params(ctypes.Structure):
         n_batch: int
         n_ubatch: int
         n_seq_max: int
+        n_rs_seq: int
         n_threads: int
         n_threads_batch: int
+        ctx_type: int
         rope_scaling_type: int
         pooling_type: int
         attention_type: int
@@ -1016,8 +1030,10 @@ class llama_context_params(ctypes.Structure):
         ("n_batch", ctypes.c_uint32),
         ("n_ubatch", ctypes.c_uint32),
         ("n_seq_max", ctypes.c_uint32),
+        ("n_rs_seq", ctypes.c_uint32),
         ("n_threads", ctypes.c_int32),
         ("n_threads_batch", ctypes.c_int32),
+        ("ctx_type", ctypes.c_int),
         ("rope_scaling_type", ctypes.c_int),
         ("pooling_type", ctypes.c_int),
         ("attention_type", ctypes.c_int),
@@ -1591,6 +1607,11 @@ def llama_n_ubatch(ctx: llama_context_p, /) -> int: ...
 def llama_n_seq_max(ctx: llama_context_p, /) -> int: ...
 
 
+# LLAMA_API uint32_t llama_n_rs_seq   (const struct llama_context * ctx);
+@ctypes_function("llama_n_rs_seq", [llama_context_p_ctypes], ctypes.c_uint32)
+def llama_n_rs_seq(ctx: llama_context_p, /) -> int: ...
+
+
 # DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead");
 @ctypes_function("llama_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32)
 def llama_n_ctx_train(model: llama_model_p, /) -> int: ...
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 91e84fed6..b9a2170fc 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 91e84fed64329cd96202d68220724a1d92f5ec1f
+Subproject commit b9a2170fce1f3f33cb4934b34efecb806bbbb348