From 775cfe8299ea5474f605935469359a9d1cdb49dc Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 2 Jan 2025 08:20:58 +0100 Subject: [PATCH 01/18] update scripts to allow release (copied from smmap) --- Makefile | 42 +++++++----------------------------------- build-release.sh | 26 ++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 35 deletions(-) create mode 100755 build-release.sh diff --git a/Makefile b/Makefile index a0a2d0e..20436bb 100644 --- a/Makefile +++ b/Makefile @@ -1,40 +1,12 @@ -PYTHON = python3 -SETUP = $(PYTHON) setup.py -TESTFLAGS = +.PHONY: all clean release force_release -all:: +all: @grep -Ee '^[a-z].*:' Makefile | cut -d: -f1 | grep -vF all -release:: clean - # Check if latest tag is the current head we're releasing - echo "Latest tag = $$(git tag | sort -nr | head -n1)" - echo "HEAD SHA = $$(git rev-parse head)" - echo "Latest tag SHA = $$(git tag | sort -nr | head -n1 | xargs git rev-parse)" - @test "$$(git rev-parse head)" = "$$(git tag | sort -nr | head -n1 | xargs git rev-parse)" - make force_release +clean: + rm -rf build/ dist/ .eggs/ .tox/ -force_release:: clean - git push --tags - python3 -m build --sdist --wheel +force_release: clean + ./build-release.sh twine upload dist/* - -doc:: - make -C doc/ html - -build:: - $(SETUP) build - $(SETUP) build_ext -i - -build_ext:: - $(SETUP) build_ext -i - -install:: - $(SETUP) install - -clean:: - $(SETUP) clean --all - rm -f *.so - -coverage:: build - PYTHONPATH=. $(PYTHON) -m pytest --cov=gitdb gitdb - + git push --tags origin master diff --git a/build-release.sh b/build-release.sh new file mode 100755 index 0000000..5840e44 --- /dev/null +++ b/build-release.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# +# This script builds a release. If run in a venv, it auto-installs its tools. +# You may want to run "make release" instead of running this script directly. 
+ +set -eEu + +function release_with() { + $1 -m build --sdist --wheel +} + +if test -n "${VIRTUAL_ENV:-}"; then + deps=(build twine) # Install twine along with build, as we need it later. + echo "Virtual environment detected. Adding packages: ${deps[*]}" + pip install --quiet --upgrade "${deps[@]}" + echo 'Starting the build.' + release_with python +else + function suggest_venv() { + venv_cmd='python -m venv env && source env/bin/activate' + printf "HELP: To avoid this error, use a virtual-env with '%s' instead.\n" "$venv_cmd" + } + trap suggest_venv ERR # This keeps the original exit (error) code. + echo 'Starting the build.' + release_with python3 # Outside a venv, use python3. +fi From 26209528a0303e47c88c174184adbf25d206a824 Mon Sep 17 00:00:00 2001 From: Eliah Kagan Date: Sun, 5 Jan 2025 03:21:33 -0500 Subject: [PATCH 02/18] Add SECURITY.md, referencing GitPython's Along with https://github.com/gitpython-developers/smmap/pull/59 and a forthcoming related PR in GitPython, this will fix #116. --- SECURITY.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 SECURITY.md diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..95389ff --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,3 @@ +# Security Policy + +See [GitPython](https://github.com/gitpython-developers/GitPython/blob/main/SECURITY.md). Vulnerabilities found in `gitdb` can be reported there. From 4fe56572894f9668c1ffd0808c96aed27c65e584 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 6 Jan 2025 10:38:13 +0000 Subject: [PATCH 03/18] Bump gitdb/ext/smmap from `f31bfa3` to `8f82e6c` Bumps [gitdb/ext/smmap](https://github.com/gitpython-developers/smmap) from `f31bfa3` to `8f82e6c`. 
- [Release notes](https://github.com/gitpython-developers/smmap/releases) - [Commits](https://github.com/gitpython-developers/smmap/compare/f31bfa378c8840d38d31e7e11ef2b84f191a491e...8f82e6c19661f9b735cc55cc89031a189e408894) --- updated-dependencies: - dependency-name: gitdb/ext/smmap dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- gitdb/ext/smmap | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gitdb/ext/smmap b/gitdb/ext/smmap index f31bfa3..8f82e6c 160000 --- a/gitdb/ext/smmap +++ b/gitdb/ext/smmap @@ -1 +1 @@ -Subproject commit f31bfa378c8840d38d31e7e11ef2b84f191a491e +Subproject commit 8f82e6c19661f9b735cc55cc89031a189e408894 From b4fd74ce8e28c372c511db2e0a491fa8b67c93f4 Mon Sep 17 00:00:00 2001 From: Eliah Kagan Date: Sun, 26 Jan 2025 11:51:11 -0500 Subject: [PATCH 04/18] Improve description of backoff sequence in db.loose The sequence of backoff wait times used in `gitdb.db.loose` is quadratic rather than exponential, as discussed in: https://github.com/gitpython-developers/gitdb/pull/115#discussion_r1903215598 This corrects the variable name by making it more general, and the comment by having it explicitly describe the backoff as quadratic. This is conceptually related to GitoxideLabs/gitoxide#1815, but this is a non-breaking change, as no interfaces are affected: only a local variable and comment. --- gitdb/db/loose.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gitdb/db/loose.py b/gitdb/db/loose.py index 03d387e..e6765cd 100644 --- a/gitdb/db/loose.py +++ b/gitdb/db/loose.py @@ -230,16 +230,16 @@ def store(self, istream): # end rename only if needed # Ensure rename is actually done and file is stable - # Retry up to 14 times - exponential wait & retry in ms. + # Retry up to 14 times - quadratic wait & retry in ms. # The total maximum wait time is 1000ms, which should be vastly enough for the # OS to return and commit the file to disk. 
- for exp_backoff_ms in [1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 181]: + for backoff_ms in [1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 181]: with suppress(PermissionError): # make sure its readable for all ! It started out as rw-- tmp file # but needs to be rwrr chmod(obj_path, self.new_objects_mode) break - time.sleep(exp_backoff_ms / 1000.0) + time.sleep(backoff_ms / 1000.0) else: raise PermissionError( "Impossible to apply `chmod` to file {}".format(obj_path) From d7a7b3b1d398b3c70997b2971769560ff6bf7491 Mon Sep 17 00:00:00 2001 From: Eliah Kagan Date: Fri, 30 May 2025 16:18:10 -0400 Subject: [PATCH 05/18] Specify explicit `contents: read` workflow permissions This change is analogous to gitpython-developers/GitPython#2033. See also gitpython-developers/smmap#60. --- .github/workflows/pythonpackage.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index 907698d..8fd6369 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -5,6 +5,9 @@ name: Python package on: [push, pull_request, workflow_dispatch] +permissions: + contents: read + jobs: build: From 8d57ac71980d7fc688acbdd8a45e1f7e0023bc81 Mon Sep 17 00:00:00 2001 From: Eliah Kagan Date: Fri, 30 May 2025 16:34:24 -0400 Subject: [PATCH 06/18] Add CI test job for no-GIL ("threaded") Python 3.13 See https://github.com/gitpython-developers/GitPython/issues/2005. The rationale is that, while this is probably less important to do in gitdb and smmap, any failure that arises for this in GitPython would likely raise the question of whether a corresponding problem has begun to occur in gitdb and smmap. 
(Both gitdb and smmap provide helpers used in GitPython even when the in-memory object database is not used, and failures may plausibly occur for reasons other than code changes because of the finicky nature of concurrency bugs and the potential for interactions affected by the runner image.) --- .github/workflows/pythonpackage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index 8fd6369..c5d7e2b 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"] os: [ubuntu-latest] experimental: [false] include: From 18b437b65b339f0d76a3c07f4cef1de4fbcb527a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 2 Jun 2025 11:11:28 +0000 Subject: [PATCH 07/18] Bump gitdb/ext/smmap from `8f82e6c` to `c6b53d3` Bumps [gitdb/ext/smmap](https://github.com/gitpython-developers/smmap) from `8f82e6c` to `c6b53d3`. - [Release notes](https://github.com/gitpython-developers/smmap/releases) - [Commits](https://github.com/gitpython-developers/smmap/compare/8f82e6c19661f9b735cc55cc89031a189e408894...c6b53d35deb82a38d5d07ca7712c1334a7a10c10) --- updated-dependencies: - dependency-name: gitdb/ext/smmap dependency-version: c6b53d35deb82a38d5d07ca7712c1334a7a10c10 dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- gitdb/ext/smmap | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gitdb/ext/smmap b/gitdb/ext/smmap index 8f82e6c..c6b53d3 160000 --- a/gitdb/ext/smmap +++ b/gitdb/ext/smmap @@ -1 +1 @@ -Subproject commit 8f82e6c19661f9b735cc55cc89031a189e408894 +Subproject commit c6b53d35deb82a38d5d07ca7712c1334a7a10c10 From 366859fd74ec5dfe36443dcbc7e752383fb689fe Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 18 Aug 2025 15:02:33 +0000 Subject: [PATCH 08/18] Bump actions/checkout from 4 to 5 Bumps [actions/checkout](https://github.com/actions/checkout) from 4 to 5. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v4...v5) --- updated-dependencies: - dependency-name: actions/checkout dependency-version: '5' dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/pythonpackage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index c5d7e2b..6730e7d 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -25,7 +25,7 @@ jobs: continue-on-error: ${{ matrix.experimental }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: fetch-depth: 0 - name: Set up Python ${{ matrix.python-version }} From 70abd0ee5d4c9c7104c4f5ad009e82b45b71a852 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 18 Aug 2025 16:46:16 +0000 Subject: [PATCH 09/18] Bump gitdb/ext/smmap from `c6b53d3` to `1de0797` Bumps [gitdb/ext/smmap](https://github.com/gitpython-developers/smmap) from `c6b53d3` to `1de0797`. 
- [Release notes](https://github.com/gitpython-developers/smmap/releases) - [Commits](https://github.com/gitpython-developers/smmap/compare/c6b53d35deb82a38d5d07ca7712c1334a7a10c10...1de0797344ed031cc1d5f9024f01e8093b02baa9) --- updated-dependencies: - dependency-name: gitdb/ext/smmap dependency-version: 1de0797344ed031cc1d5f9024f01e8093b02baa9 dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- gitdb/ext/smmap | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gitdb/ext/smmap b/gitdb/ext/smmap index c6b53d3..1de0797 160000 --- a/gitdb/ext/smmap +++ b/gitdb/ext/smmap @@ -1 +1 @@ -Subproject commit c6b53d35deb82a38d5d07ca7712c1334a7a10c10 +Subproject commit 1de0797344ed031cc1d5f9024f01e8093b02baa9 From 7f39c7473e9799bdee5d08235470f1086ac16f02 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 4 Sep 2025 14:42:01 +0000 Subject: [PATCH 10/18] Bump actions/setup-python from 5 to 6 Bumps [actions/setup-python](https://github.com/actions/setup-python) from 5 to 6. - [Release notes](https://github.com/actions/setup-python/releases) - [Commits](https://github.com/actions/setup-python/compare/v5...v6) --- updated-dependencies: - dependency-name: actions/setup-python dependency-version: '6' dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] --- .github/workflows/pythonpackage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index 6730e7d..30ebce4 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -29,7 +29,7 @@ jobs: with: fetch-depth: 0 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} allow-prereleases: ${{ matrix.experimental }} From 707b78545690ff916e5441a93e3e41bba6769ee5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 4 Sep 2025 14:54:24 +0000 Subject: [PATCH 11/18] Bump gitdb/ext/smmap from `1de0797` to `801bd6f` Bumps [gitdb/ext/smmap](https://github.com/gitpython-developers/smmap) from `1de0797` to `801bd6f`. - [Release notes](https://github.com/gitpython-developers/smmap/releases) - [Commits](https://github.com/gitpython-developers/smmap/compare/1de0797344ed031cc1d5f9024f01e8093b02baa9...801bd6f5722aa21be54ea5b113b7a73595857e1c) --- updated-dependencies: - dependency-name: gitdb/ext/smmap dependency-version: 801bd6f5722aa21be54ea5b113b7a73595857e1c dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- gitdb/ext/smmap | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gitdb/ext/smmap b/gitdb/ext/smmap index 1de0797..801bd6f 160000 --- a/gitdb/ext/smmap +++ b/gitdb/ext/smmap @@ -1 +1 @@ -Subproject commit 1de0797344ed031cc1d5f9024f01e8093b02baa9 +Subproject commit 801bd6f5722aa21be54ea5b113b7a73595857e1c From 8350bd5a434956d70959edeebf4f15f57bbe9157 Mon Sep 17 00:00:00 2001 From: sminux Date: Sat, 1 Nov 2025 09:16:41 +0300 Subject: [PATCH 12/18] Update pack.py - SonarQube issues fix #129 --- gitdb/pack.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/gitdb/pack.py b/gitdb/pack.py index e559e11..2e07079 100644 --- a/gitdb/pack.py +++ b/gitdb/pack.py @@ -267,8 +267,6 @@ def close(self): def _set_cache_(self, attr): if attr == "_packfile_checksum": self._packfile_checksum = self._cursor.map()[-40:-20] - elif attr == "_packfile_checksum": - self._packfile_checksum = self._cursor.map()[-20:] elif attr == "_cursor": # Note: We don't lock the file when reading as we cannot be sure # that we can actually write to the location - it could be a read-only @@ -848,7 +846,6 @@ def is_valid_stream(self, sha, use_crc=False): assert shawriter.sha(as_hex=False) == sha return shawriter.sha(as_hex=False) == sha # END handle crc/sha verification - return True def info_iter(self): """ From b5a9cf80d3ca5d7067e723eb3a475df616619748 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Nov 2025 10:56:50 +0000 Subject: [PATCH 13/18] Bump gitdb/ext/smmap from `801bd6f` to `5ec977a` Bumps [gitdb/ext/smmap](https://github.com/gitpython-developers/smmap) from `801bd6f` to `5ec977a`. 
- [Release notes](https://github.com/gitpython-developers/smmap/releases) - [Commits](https://github.com/gitpython-developers/smmap/compare/801bd6f5722aa21be54ea5b113b7a73595857e1c...5ec977a3b280e5dccb40cb20eba56ea26a84bd48) --- updated-dependencies: - dependency-name: gitdb/ext/smmap dependency-version: 5ec977a3b280e5dccb40cb20eba56ea26a84bd48 dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- gitdb/ext/smmap | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gitdb/ext/smmap b/gitdb/ext/smmap index 801bd6f..5ec977a 160000 --- a/gitdb/ext/smmap +++ b/gitdb/ext/smmap @@ -1 +1 @@ -Subproject commit 801bd6f5722aa21be54ea5b113b7a73595857e1c +Subproject commit 5ec977a3b280e5dccb40cb20eba56ea26a84bd48 From df9d041d6e753fd1b5f21ef7d4c994163e192127 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Nov 2025 11:06:18 +0000 Subject: [PATCH 14/18] Bump actions/checkout from 5 to 6 Bumps [actions/checkout](https://github.com/actions/checkout) from 5 to 6. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v5...v6) --- updated-dependencies: - dependency-name: actions/checkout dependency-version: '6' dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] --- .github/workflows/pythonpackage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index 30ebce4..ca5ae25 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -25,7 +25,7 @@ jobs: continue-on-error: ${{ matrix.experimental }} steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 with: fetch-depth: 0 - name: Set up Python ${{ matrix.python-version }} From af034fceb98c27f2f57091365eb62353ec7a354c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Mar 2026 11:10:26 +0000 Subject: [PATCH 15/18] Bump gitdb/ext/smmap from `5ec977a` to `e4ad410` Bumps [gitdb/ext/smmap](https://github.com/gitpython-developers/smmap) from `5ec977a` to `e4ad410`. - [Release notes](https://github.com/gitpython-developers/smmap/releases) - [Commits](https://github.com/gitpython-developers/smmap/compare/5ec977a3b280e5dccb40cb20eba56ea26a84bd48...e4ad410ac3baf2046bd4043394e7cbb119045cc1) --- updated-dependencies: - dependency-name: gitdb/ext/smmap dependency-version: e4ad410ac3baf2046bd4043394e7cbb119045cc1 dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- gitdb/ext/smmap | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gitdb/ext/smmap b/gitdb/ext/smmap index 5ec977a..e4ad410 160000 --- a/gitdb/ext/smmap +++ b/gitdb/ext/smmap @@ -1 +1 @@ -Subproject commit 5ec977a3b280e5dccb40cb20eba56ea26a84bd48 +Subproject commit e4ad410ac3baf2046bd4043394e7cbb119045cc1 From 5e74292fb75f40e0c5dd883760ee764277e92b48 Mon Sep 17 00:00:00 2001 From: Matt Van Horn Date: Thu, 30 Apr 2026 02:50:08 -0700 Subject: [PATCH 16/18] fix: replace deprecated codecs.open with built-in open (#128) Python 3.14 emits a DeprecationWarning for codecs.open(), which gitdb hits inside ReferenceDB._update_dbs_from_ref_file: DeprecationWarning: codecs.open() is deprecated. Use open() instead. The built-in open() has supported the encoding kwarg since Python 3.0 and the call site already passes encoding="utf-8", so the replacement is byte-for-byte equivalent on every supported Python version. Dropped the now-unused codecs import. Verified the change with the existing test_ref.py suite. 
Closes #128 --- gitdb/db/ref.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gitdb/db/ref.py b/gitdb/db/ref.py index bd30156..5536db0 100644 --- a/gitdb/db/ref.py +++ b/gitdb/db/ref.py @@ -2,7 +2,6 @@ # # This module is part of GitDB and is released under # the New BSD License: https://opensource.org/license/bsd-3-clause/ -import codecs from gitdb.db.base import ( CompoundDB, ) @@ -42,7 +41,7 @@ def _update_dbs_from_ref_file(self): # try to get as many as possible, don't fail if some are unavailable ref_paths = list() try: - with codecs.open(self._ref_file, 'r', encoding="utf-8") as f: + with open(self._ref_file, 'r', encoding="utf-8") as f: ref_paths = [l.strip() for l in f] except OSError: pass From 81cf3e5fd2a19842b21f30e158b7f41bf6d91f28 Mon Sep 17 00:00:00 2001 From: Matt Van Horn <455140+mvanhorn@users.noreply.github.com> Date: Fri, 8 May 2026 04:38:09 -0700 Subject: [PATCH 17/18] Fix DecompressMemMapReader.read returning b'' before EOF Closes #120 DecompressMemMapReader.read(N) could return b'' mid-stream when zlib consumed input without producing output on a single decompress call (small N, header / dictionary frames in flight). The original `if dcompdat and ...` guard at the recursion site skipped the "refill to size" recursion in that case, so a caller using the standard idiom while chunk := stream.read(4096): yield chunk terminated at the first empty chunk -- before _br == _s. The guard exists for compressed_bytes_read(), which manipulates _br=0 and then drains the inner zip past its EOF. Recursing there would loop forever because the inner zip is already done. The fix uses zlib's own `eof` attribute (available on standard zlib.Decompress objects since Python 3.6) to distinguish: - dcompdat empty AND zip not at EOF -> still digesting, recurse - dcompdat empty AND zip at EOF -> compressed_bytes_read scrub or genuine EOF; do not recurse. 
`getattr(_zip, 'eof', False)` keeps the conservative behavior when running against a custom zlib object that does not expose the attribute. Adds a regression test that reads with chunk_size in {1, 4, 16, 64} from a 13 KB highly-compressible stream. With the old guard, the chunk_size <= 16 cases stopped at byte 0; the new test asserts they read all 13000 bytes. The full existing test suite (24 tests) still passes, including test_decompress_reader_special_case and test_pack which exercise the compressed_bytes_read scrub path that the original guard existed to protect. --- gitdb/stream.py | 136 ++++++++++++++++++++++---------------- gitdb/test/test_stream.py | 35 ++++++++++ 2 files changed, 114 insertions(+), 57 deletions(-) diff --git a/gitdb/stream.py b/gitdb/stream.py index 1e0be84..f5fc3bc 100644 --- a/gitdb/stream.py +++ b/gitdb/stream.py @@ -254,68 +254,90 @@ def read(self, size=-1): # copied once, and another copy of a part of it when it creates the unconsumed # tail. We have to use it to hand in the appropriate amount of bytes during # the next read. - tail = self._zip.unconsumed_tail - if tail: - # move the window, make it as large as size demands. For code-clarity, - # we just take the chunk from our map again instead of reusing the unconsumed - # tail. The latter one would safe some memory copying, but we could end up - # with not getting enough data uncompressed, so we had to sort that out as well. - # Now we just assume the worst case, hence the data is uncompressed and the window - # needs to be as large as the uncompressed bytes we want to read. - self._cws = self._cwe - len(tail) - self._cwe = self._cws + size - else: - cws = self._cws - self._cws = self._cwe - self._cwe = cws + size - # END handle tail - - # if window is too small, make it larger so zip can decompress something - if self._cwe - self._cws < 8: - self._cwe = self._cws + 8 - # END adjust winsize - - # takes a slice, but doesn't copy the data, it says ... 
- indata = self._m[self._cws:self._cwe] - - # get the actual window end to be sure we don't use it for computations - self._cwe = self._cws + len(indata) - dcompdat = self._zip.decompress(indata, size) - # update the amount of compressed bytes read - # We feed possibly overlapping chunks, which is why the unconsumed tail - # has to be taken into consideration, as well as the unused data - # if we hit the end of the stream - # NOTE: Behavior changed in PY2.7 onward, which requires special handling to make the tests work properly. - # They are thorough, and I assume it is truly working. - # Why is this logic as convoluted as it is ? Please look at the table in - # https://github.com/gitpython-developers/gitdb/issues/19 to learn about the test-results. - # Basically, on py2.6, you want to use branch 1, whereas on all other python version, the second branch - # will be the one that works. - # However, the zlib VERSIONs as well as the platform check is used to further match the entries in the - # table in the github issue. This is it ... it was the only way I could make this work everywhere. - # IT's CERTAINLY GOING TO BITE US IN THE FUTURE ... . - if getattr(zlib, 'ZLIB_RUNTIME_VERSION', zlib.ZLIB_VERSION) in ('1.2.7', '1.2.5') and not sys.platform == 'darwin': - unused_datalen = len(self._zip.unconsumed_tail) - else: - unused_datalen = len(self._zip.unconsumed_tail) + len(self._zip.unused_data) - # # end handle very special case ... - - self._cbr += len(indata) - unused_datalen - self._br += len(dcompdat) + # + # Decompress in a loop until we have produced `size` bytes or run out + # of progress. Iteration (instead of recursion) keeps the call bounded + # for streams that consume many input bytes per produced output byte + # (e.g. zlib stored blocks of length zero); the previous recursive + # form blew the stack on inputs > ~1500 empty blocks (issue #120 + # follow-up). 
+ dcompdat = b'' + while True: + tail = self._zip.unconsumed_tail + remaining = size - len(dcompdat) + if tail: + # move the window, make it as large as size demands. For code-clarity, + # we just take the chunk from our map again instead of reusing the unconsumed + # tail. The latter one would safe some memory copying, but we could end up + # with not getting enough data uncompressed, so we had to sort that out as well. + # Now we just assume the worst case, hence the data is uncompressed and the window + # needs to be as large as the uncompressed bytes we want to read. + self._cws = self._cwe - len(tail) + self._cwe = self._cws + remaining + else: + cws = self._cws + self._cws = self._cwe + self._cwe = cws + remaining + # END handle tail + + # if window is too small, make it larger so zip can decompress something + if self._cwe - self._cws < 8: + self._cwe = self._cws + 8 + # END adjust winsize + + # takes a slice, but doesn't copy the data, it says ... + indata = self._m[self._cws:self._cwe] + + # get the actual window end to be sure we don't use it for computations + self._cwe = self._cws + len(indata) + chunk = self._zip.decompress(indata, remaining) + # update the amount of compressed bytes read + # We feed possibly overlapping chunks, which is why the unconsumed tail + # has to be taken into consideration, as well as the unused data + # if we hit the end of the stream + # NOTE: Behavior changed in PY2.7 onward, which requires special handling to make the tests work properly. + # They are thorough, and I assume it is truly working. + # Why is this logic as convoluted as it is ? Please look at the table in + # https://github.com/gitpython-developers/gitdb/issues/19 to learn about the test-results. + # Basically, on py2.6, you want to use branch 1, whereas on all other python version, the second branch + # will be the one that works. 
+ # However, the zlib VERSIONs as well as the platform check is used to further match the entries in the + # table in the github issue. This is it ... it was the only way I could make this work everywhere. + # IT's CERTAINLY GOING TO BITE US IN THE FUTURE ... . + if getattr(zlib, 'ZLIB_RUNTIME_VERSION', zlib.ZLIB_VERSION) in ('1.2.7', '1.2.5') and not sys.platform == 'darwin': + unused_datalen = len(self._zip.unconsumed_tail) + else: + unused_datalen = len(self._zip.unconsumed_tail) + len(self._zip.unused_data) + # # end handle very special case ... + + consumed = len(indata) - unused_datalen + self._cbr += consumed + self._br += len(chunk) + dcompdat += chunk + + # Stop when we have enough or there is no path to more output. + # `chunk` may legitimately be empty mid-stream when zlib is + # consuming header / dictionary frames; in that case we keep + # iterating as long as we are still feeding zlib new bytes + # (consumed > 0) and zlib has not flagged end-of-stream. The + # compressed_bytes_read() scrub loop drives this same code with + # _br manipulated to 0 past zip EOF; it terminates here because + # `getattr(_zip, 'eof', False)` is True or no compressed bytes + # are consumed. The empty-block recursion attack from issue #120 + # follow-up is bounded by the iteration; each empty block does + # consume input, so the loop walks the stream forward a constant + # amount per iteration without growing the call stack. + if len(dcompdat) >= size or self._br >= self._s: + break + zip_eof = getattr(self._zip, 'eof', False) + if not chunk and (zip_eof or len(indata) == 0 or consumed == 0): + break + # END iterative decompress if dat: dcompdat = dat + dcompdat # END prepend our cached data - # it can happen, depending on the compression, that we get less bytes - # than ordered as it needs the final portion of the data as well. - # Recursively resolve that. 
- # Note: dcompdat can be empty even though we still appear to have bytes - # to read, if we are called by compressed_bytes_read - it manipulates - # us to empty the stream - if dcompdat and (len(dcompdat) - len(dat)) < size and self._br < self._s: - dcompdat += self.read(size - len(dcompdat)) - # END handle special case return dcompdat diff --git a/gitdb/test/test_stream.py b/gitdb/test/test_stream.py index 1e7e941..390caa1 100644 --- a/gitdb/test/test_stream.py +++ b/gitdb/test/test_stream.py @@ -162,3 +162,38 @@ def test_decompress_reader_special_case(self): dump = mdb.store(IStream(ostream.type, ostream.size, BytesIO(data))) assert dump.hexsha == sha # end for each loose object sha to test + + def test_decompress_reader_chunked_read_does_not_terminate_early(self): + """Regression test for #120: read(N) must not return b'' before EOF. + + zlib can consume input without producing decompressed output (e.g. + while ingesting block headers). The reader's internal recursion + previously bailed on any empty zip output, so a caller reading in + small chunks via the standard `while chunk := stream.read(N)` idiom + would terminate at the first empty chunk -- before the actual end + of the uncompressed stream. + """ + # Highly compressible data exposes the bug because each zlib chunk + # spans many uncompressed bytes -- intermediate decompress() calls + # often return empty while consuming input. + data = b"hello world! " * 1000 + zdata = zlib.compress(data) + + # Loop with a small chunk size to force many sub-_s recursions. 
+ for chunk_size in (1, 4, 16, 64): + reader = DecompressMemMapReader( + zdata, close_on_deletion=False, size=len(data) + ) + out = bytearray() + while True: + chunk = reader.read(chunk_size) + if not chunk: + break + out.extend(chunk) + assert bytes(out) == data, ( + f"chunk_size={chunk_size}: got {len(out)}/{len(data)} bytes" + ) + assert reader._br == reader._s, ( + f"chunk_size={chunk_size}: stream stopped at " + f"{reader._br}/{reader._s}" + ) From ed41d2c26f77bc58a5b8f51100a300c4e09a7a30 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 9 May 2026 08:29:18 +0800 Subject: [PATCH 18/18] Apply suggestions from code review Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- gitdb/stream.py | 7 +++++-- gitdb/test/test_stream.py | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/gitdb/stream.py b/gitdb/stream.py index f5fc3bc..7c59f0b 100644 --- a/gitdb/stream.py +++ b/gitdb/stream.py @@ -268,7 +268,7 @@ def read(self, size=-1): if tail: # move the window, make it as large as size demands. For code-clarity, # we just take the chunk from our map again instead of reusing the unconsumed - # tail. The latter one would safe some memory copying, but we could end up + # tail. The latter one would save some memory copying, but we could end up # with not getting enough data uncompressed, so we had to sort that out as well. # Now we just assume the worst case, hence the data is uncompressed and the window # needs to be as large as the uncompressed bytes we want to read. @@ -313,7 +313,10 @@ def read(self, size=-1): consumed = len(indata) - unused_datalen self._cbr += consumed self._br += len(chunk) - dcompdat += chunk + if chunk: + if not isinstance(dcompdat, bytearray): + dcompdat = bytearray(dcompdat) + dcompdat.extend(chunk) # Stop when we have enough or there is no path to more output. 
# `chunk` may legitimately be empty mid-stream when zlib is diff --git a/gitdb/test/test_stream.py b/gitdb/test/test_stream.py index 390caa1..f36b06b 100644 --- a/gitdb/test/test_stream.py +++ b/gitdb/test/test_stream.py @@ -179,7 +179,8 @@ def test_decompress_reader_chunked_read_does_not_terminate_early(self): data = b"hello world! " * 1000 zdata = zlib.compress(data) - # Loop with a small chunk size to force many sub-_s recursions. + # Loop with a small chunk size to force many internal read/decompression + # iterations before EOF. for chunk_size in (1, 4, 16, 64): reader = DecompressMemMapReader( zdata, close_on_deletion=False, size=len(data)