Skip to content

Compression plugin #18314

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 22 commits into
base: develop2
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 13 additions & 17 deletions conan/api/subapi/cache.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
import json
import os
import shutil
import tarfile
from io import BytesIO

from conan.api.model import PackagesList
from conan.api.output import ConanOutput
from conan.internal.api.uploader import gzopen_without_timestamps
from conan.internal.cache.cache import PkgCache
from conan.internal.cache.conan_reference_layout import EXPORT_SRC_FOLDER, EXPORT_FOLDER, SRC_FOLDER, \
METADATA, DOWNLOAD_EXPORT_FOLDER
Expand All @@ -18,6 +15,7 @@
from conan.api.model import RecipeReference
from conan.internal.util.dates import revision_timestamp_now
from conan.internal.util.files import rmdir, mkdir, remove
from conan.internal.util.compression import tar_compressor, tar_extract


class CacheAPI:
Expand Down Expand Up @@ -133,9 +131,10 @@ def save(self, package_list, tgz_path, no_source=False):
mkdir(os.path.dirname(tgz_path))
name = os.path.basename(tgz_path)
compresslevel = global_conf.get("core.gzip:compresslevel", check_type=int)

with open(tgz_path, "wb") as tgz_handle:
tgz = gzopen_without_timestamps(name, fileobj=tgz_handle,
compresslevel=compresslevel)
tgz = tar_compressor(name, fileobj=tgz_handle, compresslevel=compresslevel,
cache_path=self.conan_api.cache_folder)
for ref, ref_bundle in package_list.refs().items():
ref_layout = cache.recipe_layout(ref)
recipe_folder = os.path.relpath(ref_layout.base_folder, cache_folder)
Expand Down Expand Up @@ -169,11 +168,12 @@ def save(self, package_list, tgz_path, no_source=False):
out.info(f"Saving {pref} metadata: {metadata_folder}")
tgz.add(os.path.join(cache_folder, metadata_folder), metadata_folder,
recursive=True)
# Create pkglist.json to add it to the tgz
serialized = json.dumps(package_list.serialize(), indent=2)
info = tarfile.TarInfo(name="pkglist.json")
data = serialized.encode('utf-8')
info.size = len(data)
tgz.addfile(tarinfo=info, fileobj=BytesIO(data))
pkglist_path = os.path.join(cache_folder, "pkglist.json")
with open(pkglist_path, "w") as file_handler:
file_handler.write(serialized)
tgz.add(pkglist_path, "pkglist.json", recursive=False)
tgz.close()

def restore(self, path):
Expand All @@ -182,14 +182,10 @@ def restore(self, path):

cache = PkgCache(self.conan_api.cache_folder, self.conan_api.config.global_conf)
cache_folder = cache.store # Note, this is not the home, but the actual package cache

with open(path, mode='rb') as file_handler:
the_tar = tarfile.open(fileobj=file_handler)
fileobj = the_tar.extractfile("pkglist.json")
pkglist = fileobj.read()
the_tar.extraction_filter = (lambda member, _: member) # fully_trusted (Py 3.14)
the_tar.extractall(path=cache_folder)
the_tar.close()
tar_extract(path, cache_folder, cache_folder=self.conan_api.cache_folder)
# Retrieve the package list from the already extracted archive
with open(os.path.join(cache_folder, "pkglist.json")) as file_handler:
pkglist = file_handler.read()

# After unzipping the files, we need to update the DB that references these files
out = ConanOutput()
Expand Down
11 changes: 8 additions & 3 deletions conan/internal/api/uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from conan.errors import ConanException
from conan.internal.paths import (CONAN_MANIFEST, CONANFILE, EXPORT_SOURCES_TGZ_NAME,
EXPORT_TGZ_NAME, PACKAGE_TGZ_NAME, CONANINFO)
from conan.internal.util.compression import load_compress_plugin
from conan.internal.util.files import (clean_dirty, is_dirty, gather_files,
set_dirty_context_manager, mkdir, human_size)

Expand Down Expand Up @@ -157,7 +158,7 @@ def add_tgz(tgz_name, tgz_files):
elif tgz_files:
compresslevel = self._global_conf.get("core.gzip:compresslevel", check_type=int)
tgz = compress_files(tgz_files, tgz_name, download_export_folder,
compresslevel=compresslevel, ref=ref)
compresslevel=compresslevel, ref=ref, cache_folder=self._app.cache_folder)
result[tgz_name] = tgz

add_tgz(EXPORT_TGZ_NAME, files)
Expand Down Expand Up @@ -204,7 +205,7 @@ def _compress_package_files(self, layout, pref):
tgz_files = {f: path for f, path in files.items()}
compresslevel = self._global_conf.get("core.gzip:compresslevel", check_type=int)
tgz_path = compress_files(tgz_files, PACKAGE_TGZ_NAME, download_pkg_folder,
compresslevel=compresslevel, ref=pref)
compresslevel=compresslevel, ref=pref, cache_folder=self._app.cache_folder)
assert tgz_path == package_tgz
assert os.path.exists(package_tgz)

Expand Down Expand Up @@ -271,7 +272,11 @@ def gzopen_without_timestamps(name, fileobj, compresslevel=None):
return t


def compress_files(files, name, dest_dir, compresslevel=None, ref=None):
def compress_files(files, name, dest_dir, compresslevel=None, ref=None, cache_folder=None):
compress_plugin = load_compress_plugin(cache_folder)
if compress_plugin:
return compress_plugin.tar_compress(files, name, dest_dir, compresslevel, ref)

t1 = time.time()
# FIXME, better write to disk sequentially and not keep tgz contents in memory
tgz_path = os.path.join(dest_dir, name)
Expand Down
4 changes: 4 additions & 0 deletions conan/internal/cache/home_paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,3 +90,7 @@ def settings_path_user(self):
@property
def config_version_path(self):
return os.path.join(self._home, "config_version.json")

@property
def compression_plugin_path(self):
Copy link
Preview

Copilot AI May 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] Add a docstring explaining that this property returns the path to the compression.py plugin under the extensions directory.

Suggested change
def compression_plugin_path(self):
def compression_plugin_path(self):
"""Returns the path to the `compression.py` plugin under the extensions directory."""

Copilot uses AI. Check for mistakes.

return os.path.join(self._home, _EXTENSIONS_FOLDER, _PLUGINS, "compression.py")
17 changes: 11 additions & 6 deletions conan/internal/rest/remote_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from conan.internal.model.info import load_binary_info
from conan.api.model import PkgReference
from conan.api.model import RecipeReference
from conan.internal.util.compression import load_compress_plugin
from conan.internal.util.files import rmdir, human_size
from conan.internal.paths import EXPORT_SOURCES_TGZ_NAME, EXPORT_TGZ_NAME, PACKAGE_TGZ_NAME
from conan.internal.util.files import mkdir, tar_extract
Expand Down Expand Up @@ -81,7 +82,7 @@ def get_recipe(self, ref, remote, metadata=None):
tgz_file = zipped_files.pop(EXPORT_TGZ_NAME, None)

if tgz_file:
uncompress_file(tgz_file, export_folder, scope=str(ref))
uncompress_file(tgz_file, export_folder, scope=str(ref), cache_folder=self._home_folder)
mkdir(export_folder)
for file_name, file_path in zipped_files.items(): # copy CONANFILE
shutil.move(file_path, os.path.join(export_folder, file_name))
Expand Down Expand Up @@ -123,7 +124,7 @@ def get_recipe_sources(self, ref, layout, remote):

self._signer.verify(ref, download_folder, files=zipped_files)
tgz_file = zipped_files[EXPORT_SOURCES_TGZ_NAME]
uncompress_file(tgz_file, export_sources_folder, scope=str(ref))
uncompress_file(tgz_file, export_sources_folder, scope=str(ref), cache_folder=self._home_folder)

def get_package(self, pref, remote, metadata=None):
output = ConanOutput(scope=str(pref.ref))
Expand Down Expand Up @@ -171,7 +172,7 @@ def _get_package(self, layout, pref, remote, scoped_output, metadata):

tgz_file = zipped_files.pop(PACKAGE_TGZ_NAME, None)
package_folder = layout.package()
uncompress_file(tgz_file, package_folder, scope=str(pref.ref))
uncompress_file(tgz_file, package_folder, scope=str(pref.ref), cache_folder=self._home_folder)
mkdir(package_folder) # Just in case it doesn't exist, because uncompress did nothing
for file_name, file_path in zipped_files.items(): # copy CONANINFO and CONANMANIFEST
shutil.move(file_path, os.path.join(package_folder, file_name))
Expand Down Expand Up @@ -281,15 +282,19 @@ def _call_remote(self, remote, method, *args, **kwargs):
raise ConanException(exc, remote=remote)


def uncompress_file(src_path, dest_folder, scope=None):
def uncompress_file(src_path, dest_folder, scope=None, cache_folder=None):
try:
filesize = os.path.getsize(src_path)
big_file = filesize > 10000000 # 10 MB
if big_file:
hs = human_size(filesize)
ConanOutput(scope=scope).info(f"Decompressing {hs} {os.path.basename(src_path)}")
with open(src_path, mode='rb') as file_handler:
tar_extract(file_handler, dest_folder)

compression_plugin = load_compress_plugin(cache_folder)
if compression_plugin:
compression_plugin.tar_extract(src_path, dest_folder)
else:
tar_extract(src_path, dest_folder)
except Exception as e:
error_msg = "Error while extracting downloaded file '%s' to %s\n%s\n"\
% (src_path, dest_folder, str(e))
Expand Down
91 changes: 91 additions & 0 deletions conan/internal/util/compression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
from conan.internal.cache.home_paths import HomePaths
from conan.internal.loader import load_python_file
from conan.internal.errors import ConanException

import os
import gzip
import time
import tarfile
from conan.api.output import ConanOutput
from conan.internal.util.files import set_dirty_context_manager

def tar_extract(src_path, destination_dir, cache_folder=None):
    """Extract the tar archive at ``src_path`` into ``destination_dir``.

    If a ``compression.py`` plugin is found for ``cache_folder``, extraction is
    delegated entirely to the plugin's ``tar_extract(src_path, destination_dir)``
    and its return value is returned. Otherwise the archive is opened with the
    standard ``tarfile`` module and fully extracted.

    :param src_path: path of the archive file to extract.
    :param destination_dir: folder where the archive contents are written.
    :param cache_folder: folder used to look up the compression plugin; when
        falsy, no plugin is looked up and the built-in extraction is used.
    :return: whatever the plugin returns when one is used, otherwise ``None``.
    """
    compress_plugin = load_compress_plugin(cache_folder)
    if compress_plugin:
        return compress_plugin.tar_extract(src_path, destination_dir)

    # Both the raw file and the TarFile are context-managed so they are released
    # even when extraction fails half-way.
    with open(src_path, mode='rb') as file_handler, \
            tarfile.open(fileobj=file_handler) as the_tar:
        # NOTE: "errorlevel = 2" (raise an exception on any error) is deliberately
        # not set: it was failing on Win10, where the "could not change
        # modification time" error (with time=0) must be tolerated.
        # fully_trusted filter: members are extracted unmodified, this also
        # avoids the behavior change of the default filter in Py3.14
        the_tar.extraction_filter = (lambda member, path: member)
        the_tar.extractall(path=destination_dir)


def tar_compress(files, name, dest_dir, compresslevel=None, ref=None, cache_folder=None):
    """Compress ``files`` into the archive ``dest_dir/name`` and return its path.

    If a ``compression.py`` plugin is found for ``cache_folder``, the work is
    delegated to the plugin's
    ``tar_compress(files, name, dest_dir, compresslevel, ref)`` and its result is
    returned. Otherwise a gzip-compressed tar without timestamps is written.

    :param files: mapping of ``{name inside the archive: absolute path on disk}``.
    :param name: file name of the archive to create.
    :param dest_dir: folder where the archive is created.
    :param compresslevel: gzip compression level, ``None`` selects the default.
    :param ref: reference used only as the scope of the output messages.
    :param cache_folder: folder used to look up the compression plugin; when
        falsy, no plugin is looked up and the built-in compression is used.
    :return: path of the created archive (built-in path) or the plugin's result.
    """
    compress_plugin = load_compress_plugin(cache_folder)
    if compress_plugin:
        return compress_plugin.tar_compress(files, name, dest_dir, compresslevel, ref)

    t1 = time.time()
    # FIXME, better write to disk sequentially and not keep tgz contents in memory
    tgz_path = os.path.join(dest_dir, name)
    ConanOutput(scope=str(ref)).info(f"Compressing {name}")
    with set_dirty_context_manager(tgz_path), open(tgz_path, "wb") as tgz_handle:
        # The tar writer is context-managed so that its gzip stream is closed
        # even if adding a file fails; on success it is finalized as before.
        with gzopen_without_timestamps(name, fileobj=tgz_handle,
                                       compresslevel=compresslevel) as tgz:
            # Sorted so the member order does not depend on the dict order
            for filename, abs_path in sorted(files.items()):
                # recursive is False in case it is a symlink to a folder
                tgz.add(abs_path, filename, recursive=False)

    duration = time.time() - t1
    ConanOutput().debug(f"{name} compressed in {duration} time")
    return tgz_path

def tar_compressor(name, fileobj, compresslevel, cache_path=None):
    """Return an archive writer object exposing ``add(...)`` and ``close()``.

    If a ``compression.py`` plugin is found for ``cache_path``, an instance of
    the plugin's ``TarCompressor(name, fileobj, compresslevel)`` is returned.
    Otherwise the built-in gzip tar writer without timestamps is returned.

    :param name: name of the archive being written.
    :param fileobj: already opened binary file object receiving the archive.
    :param compresslevel: compression level forwarded to the writer.
    :param cache_path: folder used to look up the compression plugin; when
        falsy, no plugin is looked up and the built-in writer is used.
    :raises ConanException: if the plugin exists but defines no ``TarCompressor``.
    """
    compress_plugin = load_compress_plugin(cache_path)
    if not compress_plugin:
        return gzopen_without_timestamps(name, fileobj, compresslevel)

    # The plugin loader only validates the tar_extract/tar_compress functions,
    # so check the class here and fail with a clear message instead of a bare
    # AttributeError.
    compressor_class = getattr(compress_plugin, "TarCompressor", None)
    if compressor_class is None:
        raise ConanException("The 'compression.py' plugin does not contain the required "
                             "`TarCompressor` class")
    return compressor_class(name, fileobj, compresslevel)


def gzopen_without_timestamps(name, fileobj, compresslevel=None):
    """Open a gzip-compressed tar archive for writing, without gzip timestamps.

    The gzip stream is created with ``mtime=0`` (not ``None``) so the current
    time is not stored in the gzip header, which would change the checksum of
    otherwise identical archives. ``tarfile.open`` cannot be used for this
    because it does not forward arguments to the ``GzipFile`` constructor.

    :param name: archive name, passed to both the gzip and the tar layers.
    :param fileobj: binary file object that receives the compressed bytes.
    :param compresslevel: gzip compression level; ``None`` means 9.
    :return: a ``tarfile.TarFile`` opened in write mode.
    """
    if compresslevel is None:
        compresslevel = 9  # default Gzip = 9
    gzip_stream = gzip.GzipFile(filename=name, mode="w", compresslevel=compresslevel,
                                fileobj=fileobj, mtime=0)
    # The format is forced: the default changed in Python 3.8 and a different
    # format produces tarfiles with different checksums, breaking tgz hashes.
    # PAX_FORMAT is the Py38 default, made explicit for older Python versions.
    tar = tarfile.TarFile.taropen(name=name, mode="w", fileobj=gzip_stream,
                                  format=tarfile.PAX_FORMAT)
    # Mark the gzip stream as owned by the TarFile, so that closing the tar also
    # closes the GzipFile wrapper.
    tar._extfileobj = False
    return tar


def load_compress_plugin(cache_folder):
    """Load the ``compression.py`` plugin for ``cache_folder``, if there is one.

    :param cache_folder: folder from which the plugin path is derived via
        ``HomePaths``; a falsy value disables the lookup.
    :return: the loaded plugin module, or ``None`` when ``cache_folder`` is falsy
        or the plugin file does not exist.
    :raises ConanException: if the plugin file exists but lacks ``tar_extract``
        or ``tar_compress``.
    """
    if not cache_folder:
        return None

    plugin_path = HomePaths(cache_folder).compression_plugin_path
    if not os.path.exists(plugin_path):
        return None

    plugin_module, _ = load_python_file(plugin_path)
    required_members = ("tar_extract", "tar_compress")
    if not all(hasattr(plugin_module, member) for member in required_members):
        raise ConanException("The 'compression.py' plugin does not contain required `tar_extract` or `tar_compress` functions")
    return plugin_module


"""
Plugin `compression.py` interface:

def tar_extract(src_path, destination_dir) -> None
def tar_compress(files, name, dest_dir, compresslevel=None, ref=None) -> str
class TarCompressor(name, fileobj, compresslevel)
    def add(self, abs_path, filename, recursive=True) -> None
    def close(self) -> None
"""
Loading