# Copyright (C) 2020  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

"""Creates a ZIP file deterministically. This can be used as a base reference
for delta files."""

import os
import subprocess
from typing import Iterable, List

from . import common
from . import parameters


def normalize_timestamps(entries: Iterable[str]) -> None:
    """Set the access and modification times of every path to the epoch.

    :param entries: paths whose timestamps are reset to ``(0, 0)``.
    """
    for entry in entries:
        os.utime(entry, (0, 0))


def normalize_perms(checkout_dir: str) -> None:
    """Recursively reset the mode of everything below ``checkout_dir``.

    Directories are set to ``0o100755``, regular files to ``0o100644`` and
    symlinks to ``0o120000``. ``checkout_dir`` itself is not modified.

    :param checkout_dir: directory whose content is normalized in place.
    """
    # Info-ZIP has an extension that allows storing permissions; but not all
    # implementations support it (eg. not Python). So let's normalize by
    # wiping all permissions.
    for dir_entry in os.scandir(checkout_dir):
        if dir_entry.is_dir(follow_symlinks=False):
            os.chmod(dir_entry.path, 0o100755)
            normalize_perms(dir_entry.path)
        elif dir_entry.is_file(follow_symlinks=False):
            os.chmod(dir_entry.path, 0o100644)
        elif dir_entry.is_symlink():
            # NOTE(review): os.chmod follows symlinks by default, so this
            # presumably applies to the link's target rather than the link
            # itself (and fails on a dangling link) -- confirm it is intended.
            os.chmod(dir_entry.path, 0o120000)
        else:
            assert False, f"Unknown DirEntry type: {dir_entry}"


def walk(checkout_dir: str) -> List[str]:
    """List every directory and file below ``checkout_dir``.

    Paths are relative to ``checkout_dir``; the root directory itself is
    recorded as the empty string. Each directory is immediately followed by
    the files it directly contains.

    :param checkout_dir: directory to traverse.
    :returns: relative paths, in :func:`os.walk` order.
    """
    entries = []
    for (dirpath, _dirnames, filenames) in os.walk(checkout_dir):
        assert dirpath.startswith(checkout_dir)
        # Strip the root prefix so that entries are relative to checkout_dir.
        dirpath = dirpath[len(checkout_dir) :].lstrip("/")
        entries.append(dirpath)
        paths = [os.path.join(dirpath, filename) for filename in filenames]
        entries.extend(paths)
    return entries


def compress(
    executables: common.Executables,
    encoding_software: parameters.EncodingSoftware,
    checkout_dir: str,
    target: str,
) -> None:
    """Generates a reference zipball for the given checked out directory.

    Any existing file at ``target`` is removed first. Permissions and
    timestamps of the content of ``checkout_dir`` are normalized in place
    before the archive is built.

    :param executables: paths of the external archivers.
    :param encoding_software: which archiver to run.
    :param checkout_dir: existing directory to archive.
    :param target: path of the zip file to create.
    :raises subprocess.CalledProcessError: if an external tool exits with a
        non-zero status.
    """
    # NOTE(review): ``target`` is removed and stripped relative to the current
    # working directory, but the archivers run with cwd=checkout_dir; this
    # presumably expects an absolute path -- confirm against callers.
    try:
        os.remove(target)
    except FileNotFoundError:
        pass

    assert os.path.isdir(checkout_dir), checkout_dir

    entries_str = walk(checkout_dir)
    normalize_perms(checkout_dir)
    normalize_timestamps(os.path.join(checkout_dir, entry) for entry in entries_str)

    # Encode *before* sorting; sorting on unicode changes across configurations.
    entries = [entry.encode() for entry in entries_str]

    # Sort entries ourselves; InfoZIP's zip does not guarantee order
    entries.sort()

    if encoding_software == parameters.EncodingSoftware.INFOZIP_3_0:
        # -X = --no-extra, which prevents inclusion of extra non-deterministic
        #      and implementation-dependent data
        # -o = --latest-time, which sets the modification time of the zip to that
        #      of the most recent file
        assert executables.infozip_3_0
        proc = subprocess.run(
            [executables.infozip_3_0, "-X", "-o", target, "--names-stdin"],
            cwd=checkout_dir,
            input=b"\n".join(entries),
        )
        proc.check_returncode()
    elif encoding_software == parameters.EncodingSoftware.SEVENZIP_6_3:
        assert executables.sevenzip_6_3
        assert target.endswith(".zip")
        proc = subprocess.run(
            [executables.sevenzip_6_3, "a", target, *entries],
            cwd=checkout_dir,
            capture_output=True,
        )
        proc.check_returncode()

        # 7zip doesn't provide a way to disable its nondeterministic fields;
        # let's remove them after the fact.
        proc = subprocess.run(["strip-nondeterminism", "-t", "zip", target])
        # Fail loudly instead of handing back an archive that still carries
        # nondeterministic fields.
        proc.check_returncode()
    # NOTE(review): any other EncodingSoftware value falls through here and no
    # archive is produced -- confirm whether that should be an error.