diff --git a/.gitmodules b/.gitmodules
index 7edf0975d..f56cca32d 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
 [submodule "vendor/llama.cpp"]
 	path = vendor/llama.cpp
-	url = https://github.com/ggerganov/llama.cpp.git
+	url = https://github.com/ggml-org/llama.cpp.git
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5031e5808..36e4fa168 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,9 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-- feat: Update llama.cpp to ggerganov/llama.cpp@5d6f18a63 and sync Python bindings
+- feat: Update llama.cpp to ggerganov/llama.cpp@91e84fed6 and sync Python bindings
+- chore: Migrate llama.cpp submodule URL to ggml-org/llama.cpp by @shalinib-ibm in #2034
+- fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217
+
+## [0.3.23]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@7d442abf
 - fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205
 - fix: Configure embedding contexts with enough sequence slots for batched `embed()` calls
+- fix: Mark all embedding input tokens as outputs to avoid llama.cpp override warnings by @Anai-Guo in #2212
 
 ## [0.3.22]
 
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index 78292de30..eb37da209 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.22"
+__version__ = "0.3.23"
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 752c25dd3..75c74b41f 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -402,7 +402,7 @@ def __init__(
                 self.n_batch,
                 llama_cpp.llama_max_parallel_sequences(),
             )
-
+            self.context_params.kv_unified = True
         self._ctx = self._stack.enter_context(
             contextlib.closing(
                 internals.LlamaContext(
@@ -1040,7 +1040,13 @@ def embed(
 
         # get pooling information
        pooling_type = self.pooling_type()
-        logits_all = pooling_type == llama_cpp.LLAMA_POOLING_TYPE_NONE
+        # In embedding mode every input token must be marked as an output, regardless of
+        # pooling type. llama.cpp would otherwise override per-token `logits[i]` and emit
+        # "embeddings required but some input tokens were not marked as outputs ->
+        # overriding" once per input. Pooling NONE vs MEAN/CLS only changes how the
+        # per-token outputs are read back (see decode_batch below), not whether they are
+        # produced. See abetlen/llama-cpp-python#2208.
+        logits_all = True
 
         if self.context_params.embeddings is False:
             raise RuntimeError(
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index a5ec5d190..a9c32a15b 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -2837,6 +2837,9 @@ def llama_state_seq_load_file(
 ) -> int: ...
 
 
+# define LLAMA_STATE_SEQ_FLAGS_NONE 0
+LLAMA_STATE_SEQ_FLAGS_NONE = 0
+
 # for backwards-compat
 # define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
 LLAMA_STATE_SEQ_FLAGS_SWA_ONLY = 1
diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py
index 485dc5d8c..f2b0ed2de 100644
--- a/llama_cpp/mtmd_cpp.py
+++ b/llama_cpp/mtmd_cpp.py
@@ -134,6 +134,23 @@ class mtmd_decoder_pos(Structure):
     ]
 
 
+# struct mtmd_caps {
+#     bool inp_vision;
+#     bool inp_audio;
+# };
+class mtmd_caps(Structure):
+    """Capabilities exposed by an mmproj file."""
+
+    if TYPE_CHECKING:
+        inp_vision: bool
+        inp_audio: bool
+
+    _fields_ = [
+        ("inp_vision", c_bool),
+        ("inp_audio", c_bool),
+    ]
+
+
 ################################################
 # mtmd.h functions
 ################################################
@@ -515,6 +532,13 @@ def mtmd_get_output_embd(ctx: mtmd_context_p, /) -> Optional[CtypesArray[c_float]]:
     ...
 
 
+# MTMD_API struct mtmd_caps mtmd_get_cap_from_file(const char * mmproj_fname);
+@ctypes_function("mtmd_get_cap_from_file", [c_char_p], mtmd_caps)
+def mtmd_get_cap_from_file(mmproj_fname: bytes, /) -> mtmd_caps:
+    """Get mmproj capabilities without initializing a full MTMD context."""
+    ...
+
+
 # MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
 @ctypes_function("mtmd_test_create_input_chunks", [], mtmd_input_chunks_p_ctypes)
 def mtmd_test_create_input_chunks() -> Optional[mtmd_input_chunks_p]:
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 5d6f18a63..91e84fed6 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 5d6f18a6387a7066fe387233f2ca6f113cb209fb
+Subproject commit 91e84fed64329cd96202d68220724a1d92f5ec1f
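Note: a minimal sketch of the batched embedding path that the `kv_unified` and `logits_all` changes above affect. The model path is a placeholder, and the shape of each returned embedding depends on the model's pooling metadata (pooled vector for MEAN/CLS, per-token list for pooling NONE).

    from llama_cpp import Llama

    # Embedding contexts now run with a unified KV cache and mark every input token
    # as an output, so each prompt in a batched embed() call keeps its full context
    # and no "overriding" warnings are emitted by llama.cpp.
    llm = Llama(model_path="./models/embedding-model.gguf", embedding=True, verbose=False)

    texts = ["first document", "second document", "third document"]
    embeddings = llm.embed(texts)  # one embedding per input, decoded as separate sequences
    print(len(embeddings))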
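Note: a hypothetical probe using the new `mtmd_get_cap_from_file` binding, assuming the bundled mtmd shared library is available at import time and using a placeholder mmproj path.

    from llama_cpp import mtmd_cpp

    # Reads only the mmproj capabilities; no full MTMD context is initialized.
    caps = mtmd_cpp.mtmd_get_cap_from_file(b"./models/mmproj-model.gguf")
    print("vision:", bool(caps.inp_vision), "audio:", bool(caps.inp_audio))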