diff --git a/.gitmodules b/.gitmodules
index 7edf0975d..f56cca32d 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
 [submodule "vendor/llama.cpp"]
 	path = vendor/llama.cpp
-	url = https://github.com/ggerganov/llama.cpp.git
+	url = https://github.com/ggml-org/llama.cpp.git
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5031e5808..36e4fa168 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,9 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-- feat: Update llama.cpp to ggerganov/llama.cpp@5d6f18a63 and sync Python bindings
+- feat: Update llama.cpp to ggerganov/llama.cpp@91e84fed6 and sync Python bindings
+- chore: Migrate llama.cpp submodule URL to ggml-org/llama.cpp by @shalinib-ibm in #2034
+- fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217
+
+## [0.3.23]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@7d442abf
 - fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205
 - fix: Configure embedding contexts with enough sequence slots for batched `embed()` calls
+- fix: Mark all embedding input tokens as outputs to avoid llama.cpp override warnings by @Anai-Guo in #2212
 
 ## [0.3.22]
 
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index 78292de30..eb37da209 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.22"
+__version__ = "0.3.23"
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 752c25dd3..75c74b41f 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -402,7 +402,7 @@ def __init__(
                 self.n_batch,
                 llama_cpp.llama_max_parallel_sequences(),
             )
-
+            self.context_params.kv_unified = True
         self._ctx = self._stack.enter_context(
             contextlib.closing(
                 internals.LlamaContext(
@@ -1040,7 +1040,13 @@ def embed(
 
         # get pooling information
        pooling_type = self.pooling_type()
-        logits_all = pooling_type == llama_cpp.LLAMA_POOLING_TYPE_NONE
+        # In embedding mode every input token must be marked as an output, regardless of
+        # pooling type. llama.cpp would otherwise override per-token `logits[i]` and emit
+        # "embeddings required but some input tokens were not marked as outputs ->
+        # overriding" once per input. Pooling NONE vs MEAN/CLS only changes how the
+        # per-token outputs are read back (see decode_batch below), not whether they are
+        # produced. See abetlen/llama-cpp-python#2208.
+        logits_all = True
 
         if self.context_params.embeddings is False:
             raise RuntimeError(
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index a5ec5d190..a9c32a15b 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -2837,6 +2837,9 @@ def llama_state_seq_load_file(
 ) -> int: ...
 
 
+# define LLAMA_STATE_SEQ_FLAGS_NONE 0
+LLAMA_STATE_SEQ_FLAGS_NONE = 0
+
 # for backwards-compat
 # define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
 LLAMA_STATE_SEQ_FLAGS_SWA_ONLY = 1
diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py
index 485dc5d8c..f2b0ed2de 100644
--- a/llama_cpp/mtmd_cpp.py
+++ b/llama_cpp/mtmd_cpp.py
@@ -134,6 +134,23 @@ class mtmd_decoder_pos(Structure):
     ]
 
 
+# struct mtmd_caps {
+#     bool inp_vision;
+#     bool inp_audio;
+# };
+class mtmd_caps(Structure):
+    """Capabilities exposed by an mmproj file."""
+
+    if TYPE_CHECKING:
+        inp_vision: bool
+        inp_audio: bool
+
+    _fields_ = [
+        ("inp_vision", c_bool),
+        ("inp_audio", c_bool),
+    ]
+
+
 ################################################
 # mtmd.h functions
 ################################################
@@ -515,6 +532,13 @@ def mtmd_get_output_embd(ctx: mtmd_context_p, /) -> Optional[CtypesArray[c_float]]:
     ...
 
 
+# MTMD_API struct mtmd_caps mtmd_get_cap_from_file(const char * mmproj_fname);
+@ctypes_function("mtmd_get_cap_from_file", [c_char_p], mtmd_caps)
+def mtmd_get_cap_from_file(mmproj_fname: bytes, /) -> mtmd_caps:
+    """Get mmproj capabilities without initializing a full MTMD context."""
+    ...
+
+
 # MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
 @ctypes_function("mtmd_test_create_input_chunks", [], mtmd_input_chunks_p_ctypes)
 def mtmd_test_create_input_chunks() -> Optional[mtmd_input_chunks_p]:
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 5d6f18a63..91e84fed6 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 5d6f18a6387a7066fe387233f2ca6f113cb209fb
+Subproject commit 91e84fed64329cd96202d68220724a1d92f5ec1f
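Note: a minimal sketch of the batched embedding path that the `kv_unified` and `logits_all` changes above affect. The model path is a placeholder, and the shape of each returned embedding depends on the model's pooling metadata (pooled vector for MEAN/CLS, per-token list for pooling NONE).

    from llama_cpp import Llama

    # Embedding contexts now run with a unified KV cache and mark every input token
    # as an output, so each prompt in a batched embed() call keeps its full context
    # and no "overriding" warnings are emitted by llama.cpp.
    llm = Llama(model_path="./models/embedding-model.gguf", embedding=True, verbose=False)

    texts = ["first document", "second document", "third document"]
    embeddings = llm.embed(texts)  # one embedding per input, decoded as separate sequences
    print(len(embeddings))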
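Note: a hypothetical probe using the new `mtmd_get_cap_from_file` binding, assuming the bundled mtmd shared library is available at import time and using a placeholder mmproj path.

    from llama_cpp import mtmd_cpp

    # Reads only the mmproj capabilities; no full MTMD context is initialized.
    caps = mtmd_cpp.mtmd_get_cap_from_file(b"./models/mmproj-model.gguf")
    print("vision:", bool(caps.inp_vision), "audio:", bool(caps.inp_audio))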