#!/usr/bin/env python3
#
# Copyright OpenEmbedded Contributors
#
# SPDX-License-Identifier: MIT
#
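#
# Example invocation (a sketch; the cache path is illustrative):
#
#   sstate-cache-management.py --cache-dir /path/to/sstate-cache --remove-duplicated -y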

import argparse
import os
import re
import sys

from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path

if sys.version_info < (3, 8, 0):
    raise RuntimeError("Sorry, python 3.8.0 or later is required for this script.")

SSTATE_PREFIX = "sstate:"
SSTATE_EXTENSION = ".tar.zst"
# SSTATE_EXTENSION = ".tgz"
# .siginfo.done files are mentioned in the original script?
SSTATE_SUFFIXES = (
    SSTATE_EXTENSION,
    f"{SSTATE_EXTENSION}.siginfo",
    f"{SSTATE_EXTENSION}.done",
)

RE_SSTATE_PKGSPEC = re.compile(
    rf"""sstate:(?P<pn>[^:]*):
         (?P<package_target>[^:]*):
         (?P<pv>[^:]*):
         (?P<pr>[^:]*):
         (?P<sstate_pkgarch>[^:]*):
         (?P<sstate_version>[^_]*):
         (?P<bb_unihash>[^_]*)_
         (?P<bb_task>[^:]*)
         (?P<ext>({"|".join([re.escape(s) for s in SSTATE_SUFFIXES])}))$""",
    re.X,
)
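
# For illustration (hypothetical package values), a cache entry named
#
#   sstate:zlib:core2-64-poky-linux:1.3.1:r0:core2-64:11:0123abcd_populate_sysroot.tar.zst
#
# parses to pn="zlib", sstate_pkgarch="core2-64", sstate_version="11",
# bb_unihash="0123abcd", bb_task="populate_sysroot" and ext=".tar.zst".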


# Really we'd like something like a Path subclass which implements a stat
# cache here; unfortunately there's no good way to do that transparently
# (yet), see:
#
# https://github.com/python/cpython/issues/70219
# https://discuss.python.org/t/make-pathlib-extensible/3428/77
@dataclass
class SstateEntry:
    """Class for keeping track of an entry in sstate-cache."""

    path: Path
    match: re.Match
    stat_result: os.stat_result = None  # populated lazily in collect_sstate_paths()

    def __hash__(self):
        return self.path.__hash__()

    def __getattr__(self, name):
        # Delegate unknown attribute lookups to the regex match's named groups.
        return self.match.group(name)
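
    # Illustration (hypothetical values): for the example filename above,
    # SstateEntry(p, m).pn == "zlib" and SstateEntry(p, m).bb_task ==
    # "populate_sysroot", via the __getattr__ delegation.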


# this is what's in the original script; as far as I can tell, it's an
# implementation artefact which we don't need?
def find_archs():
    # all_archs
    builder_arch = os.uname().machine

    # FIXME
    layer_paths = [Path("../..")]

    tune_archs = set()
    re_tune = re.compile(r'AVAILTUNES .*=.*"(.*)"')
    for path in layer_paths:
        for tunefile in [
            p for p in path.glob("meta*/conf/machine/include/**/*") if p.is_file()
        ]:
            with open(tunefile) as f:
                for line in f:
                    m = re_tune.match(line)
                    if m:
                        tune_archs.update(m.group(1).split())

    # all_machines
    machine_archs = set()
    for path in layer_paths:
        for machine_file in path.glob("meta*/conf/machine/*.conf"):
            machine_archs.add(machine_file.parts[-1][:-5])

    extra_archs = set()
    all_archs = (
        set(
            arch.replace("-", "_")
            for arch in machine_archs | tune_archs | set(["allarch", builder_arch])
        )
        | extra_archs
    )

    print(all_archs)


# again, not needed?
def find_tasks(paths):
    print(set([p.bb_task for p in paths]))


def collect_sstate_paths(args):
    def scandir(path, paths):
        # Assume everything is a directory; by not checking we avoid needing an
        # additional stat which is potentially a synchronous roundtrip over NFS
        try:
            for p in path.iterdir():
                filename = p.parts[-1]
                if filename.startswith(SSTATE_PREFIX):
                    if filename.endswith(SSTATE_SUFFIXES):
                        m = RE_SSTATE_PKGSPEC.match(filename)
                        assert m
                        paths.add(SstateEntry(p, m))
                    # ignore other things (includes things like lockfiles)
                else:
                    scandir(p, paths)

        except NotADirectoryError:
            pass

    paths = set()
    # TODO: parallelise scandir
    scandir(Path(args.cache_dir), paths)

    def path_stat(p):
        p.stat_result = p.path.lstat()

    if args.remove_duplicated:
        # This is probably slightly performance negative on a local filesystem
        # when we interact with the GIL; over NFS it's a massive win.
        with ThreadPoolExecutor(max_workers=args.jobs) as executor:
            executor.map(path_stat, paths)

    return paths


def remove_by_stamps(args, paths):
    all_sums = set()
    # Stamp names end in .do_<task>.sigdata.<hash> or .do_<task>_setscene.<hash>;
    # capture the trailing hash to compare against each entry's bb_unihash.
    re_sigdata = re.compile(r"do_.*\.sigdata\.([^.]*)")
    re_setscene = re.compile(r"do_.*_setscene\.([^.]*)")
    for stamps_dir in args.stamps_dir:
        stamps_path = Path(stamps_dir)
        assert stamps_path.is_dir()
        all_sums |= set(
            [
                re_sigdata.search(x.parts[-1]).group(1)
                for x in stamps_path.glob("*/*/*.do_*.sigdata.*")
            ]
        )
        all_sums |= set(
            [
                re_setscene.search(x.parts[-1]).group(1)
                for x in stamps_path.glob("*/*/*.do_*_setscene.*")
            ]
        )
    return [p for p in paths if p.bb_unihash not in all_sums]


def remove_duplicated(args, paths):
    # Skip populate_lic as it produces duplicates in a normal build
    #
    # 9ae16469e707 sstate-cache-management: skip populate_lic archives when removing duplicates
    valid_paths = [p for p in paths if p.bb_task != "populate_lic"]

    keep = dict()
    remove = list()
    for p in valid_paths:
        # Deduplicate per (pn, arch, task, suffix); the newest mtime wins.
        sstate_sig = ":".join([p.pn, p.sstate_pkgarch, p.bb_task, p.ext])
        if sstate_sig not in keep:
            keep[sstate_sig] = p
        elif p.stat_result.st_mtime > keep[sstate_sig].stat_result.st_mtime:
            remove.append(keep[sstate_sig])
            keep[sstate_sig] = p
        else:
            remove.append(p)

    return remove


def remove_orphans(args, paths):
    remove = list()
    pathsigs = defaultdict(list)
    for p in paths:
        sstate_sig = ":".join([p.pn, p.sstate_pkgarch, p.bb_task])
        pathsigs[sstate_sig].append(p)
    for k, v in pathsigs.items():
        # If no group member is the archive itself, the remaining
        # .siginfo/.done tracking files are orphans.
        if len([p for p in v if p.ext == SSTATE_EXTENSION]) == 0:
            remove.extend(v)
    return remove


def parse_arguments():
    parser = argparse.ArgumentParser(description="sstate cache management utility.")

    parser.add_argument(
        "--cache-dir",
        default=os.environ.get("SSTATE_CACHE_DIR"),
        help="""Specify the sstate cache directory; defaults to the
             SSTATE_CACHE_DIR environment variable if not specified.""",
    )

    # parser.add_argument(
    #     "--extra-archs",
    #     help="""Specify list of architectures which should be tested, this list
    #             will be extended with native arch, allarch and empty arch. The
    #             script won't be trying to generate list of available archs from
    #             AVAILTUNES in tune files.""",
    # )

    # parser.add_argument(
    #     "--extra-layer",
    #     help="""Specify the layer which will be used for searching the archs,
    #             it will search the meta and meta-* layers in the top dir by
    #             default, and will search meta, meta-*, <layer1>, <layer2>,
    #             ...<layern> when specified. Use "," as the separator.
    #
    #             This is useless for --stamps-dir or when --extra-archs is used.""",
    # )

    parser.add_argument(
        "-d",
        "--remove-duplicated",
        action="store_true",
        help="""Remove duplicated sstate cache files of one package; only the
             newest one will be kept. The duplicated sstate cache files of one
             package must have the same arch, which means sstate cache files
             with multiple archs are not considered duplicates.

             Conflicts with --stamps-dir.""",
    )

    parser.add_argument(
        "--remove-orphans",
        action="store_true",
        help=f"""Remove orphan siginfo files from the sstate cache, i.e. those
             where there is no {SSTATE_EXTENSION} file but there are associated
             tracking files.""",
    )

    parser.add_argument(
        "--stamps-dir",
        action="append",
        help="""Specify the build directory's stamps directories; the sstate
             cache files which ARE USED by these build directories will be
             KEPT, other sstate cache files in cache-dir will be removed. Can
             be specified multiple times for several directories.

             Conflicts with --remove-duplicated.""",
    )

    parser.add_argument(
        "-j", "--jobs", default=8, type=int, help="Run JOBS jobs in parallel."
    )

    # parser.add_argument(
    #     "-L",
    #     "--follow-symlink",
    #     action="store_true",
    #     help="Remove both the symbolic link and the destination file, default: no.",
    # )

    parser.add_argument(
        "-y",
        "--yes",
        action="store_true",
        help="""Automatic yes to prompts; assume "yes" as answer to all prompts
             and run non-interactively.""",
    )

    parser.add_argument(
        "-v", "--verbose", action="store_true", help="Explain what is being done."
    )

    parser.add_argument(
        "-D",
        "--debug",
        action="count",
        default=0,
        help="Show debug info, repeat for more debug info.",
    )

    args = parser.parse_args()
    if args.cache_dir is None or (
        not args.remove_duplicated and not args.stamps_dir and not args.remove_orphans
    ):
        parser.print_usage()
        sys.exit(1)

    return args


def main():
    args = parse_arguments()

    paths = collect_sstate_paths(args)
    if args.remove_duplicated:
        remove = remove_duplicated(args, paths)
    elif args.stamps_dir:
        remove = remove_by_stamps(args, paths)
    else:
        remove = list()

    if args.remove_orphans:
        remove = set(remove) | set(remove_orphans(args, paths))

    if args.debug >= 1:
        print("\n".join([str(p.path) for p in remove]))
    print(f"{len(remove)} out of {len(paths)} files will be removed!")
    if not args.yes:
        print("Do you want to continue (y/n)?")
        confirm = input() in ("y", "Y")
    else:
        confirm = True
    if confirm:
        # TODO: parallelise remove
        for p in remove:
            p.path.unlink()


if __name__ == "__main__":
    main()