From c4816882afad798dd7d8b8551b34ea863216bdb6 Mon Sep 17 00:00:00 2001
From: Eric Froemling
Date: Fri, 28 Jul 2023 15:21:46 -0700
Subject: [PATCH] efrocache improvements

---
 CHANGELOG.md                 |  19 +++
 tools/efrotools/efrocache.py | 287 +++++++++++++++++++++--------------
 2 files changed, 189 insertions(+), 117 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index efcb7874..f79ad9bb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,25 @@
   could set up a server that never gets pruned and contains all history from
   now until forever. Efrocache is basically just a big pile of files organized
   by their hashes (see `tools/efrotools/efrocache.py` for details).
+- The default efrocache file location is now `.cache/efrocache` instead of
+  `.efrocache`. Feel free to blow away any `.efrocache` dir if you still have
+  one (or move it to the new path to avoid having to download things again).
+- It is now possible to set an `EFROCACHE_DIR` env var to tell efrocache to
+  store its local files somewhere besides the per-project default of
+  `.cache/efrocache`. This can save a lot of download time if you share it
+  between multiple repos or do full cleans/rebuilds a lot (if it is
+  outside the project dir it won't get blown away during cleans). Efrocache dirs
+  are universal (again, it's just a big pile of files organized by hash) so there
+  should be no issues sharing cache dirs. Another nice side effect of
+  maintaining a single local efrocache dir is that anything you've ever built
+  will still be buildable; otherwise, if your build tries to download very old
+  cache files, they may no longer be available on my efrocache server.
+- Hardened the efrocache code a bit so that failures during downloads or
+  decompression are less likely to leave problematic half-made stuff lying
+  around. Namely, things are now always downloaded or decompressed into temp
+  dirs and only moved into their final locations once that completes
+  successfully. It's extra important to be safe now that it's possible to share
+  local efrocache dirs between projects or otherwise keep them around longer.
 
 ### 1.7.24 (build 21199, api 8, 2023-07-27)
 
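To make the notes above concrete, here is a small illustrative sketch (not
part of the commit) of how the cache is laid out: entries are just files
addressed by hash, and the lookup logic in get_target() further down derives
both the on-disk location and the expected hash from the last three path
components of a cache-map URL. The relurl value is hypothetical.

import os

# EFROCACHE_DIR wins if set; otherwise fall back to the new per-project
# default (this mirrors get_local_cache_dir() in the diff below, minus
# its validation).
local_cache_dir = os.environ.get('EFROCACHE_DIR', '.cache/efrocache')

relurl = 'ab/cd/0123456789abcdef'  # hypothetical .efrocachemap entry
subpath = '/'.join(relurl.split('/')[-3:])
local_cache_path = os.path.join(local_cache_dir, subpath)
hashval = ''.join(subpath.split('/'))
print(local_cache_path)  # '.cache/efrocache/ab/cd/0123456789abcdef'
print(hashval)           # 'abcd0123456789abcdef'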
+ """ + envval = os.environ.get('EFROCACHE_DIR') + if not isinstance(envval, str): + envval = '.cache/efrocache' + if not envval: + raise RuntimeError('efrocache-local-dir cannot be an empty string.') + if envval.endswith('/') or envval.endswith('\\'): + raise RuntimeError('efrocache-local-dir must not end with a slash.') + return envval + + def get_repository_base_url() -> str: """Return the base repository url (assumes cwd is project root).""" # from efrotools import getprojectconfig @@ -100,8 +116,12 @@ def get_target(path: str) -> None: """Fetch a target path from the cache, downloading if need be.""" # pylint: disable=too-many-locals # pylint: disable=too-many-statements + import tempfile + from efro.error import CleanError + local_cache_dir = get_local_cache_dir() + path = _project_centric_path(path) with open(CACHE_MAP_NAME, encoding='utf-8') as infile: @@ -118,13 +138,12 @@ def get_target(path: str) -> None: url = f'{repo}/{relurl}' subpath = '/'.join(url.split('/')[-3:]) - local_cache_path = os.path.join(CACHE_DIR_NAME, subpath) - local_cache_path_dl = local_cache_path + '.download' + local_cache_path = os.path.join(local_cache_dir, subpath) hashval = ''.join(subpath.split('/')) - # First off: if there's already a file in place, check its hash. If - # it matches the cache, we can just update its timestamp and call it - # a day. + # First off: if there's already a cache file in place, check its + # hash. If its calced hash matches its path, we can just update its + # timestamp and call it a day. if os.path.isfile(path): existing_hash = get_existing_file_hash(path) if existing_hash == hashval: @@ -132,51 +151,61 @@ def get_target(path: str) -> None: print(f'Refreshing from cache: {path}') return + # Ok we need to download the cache file. # Ok there's not a valid file in place already. Clear out whatever # is there to start with. if os.path.exists(path): - os.unlink(path) + os.remove(path) - # Now if we don't have this entry in our local cache, - # download it. + # Now, if we don't have this entry in our local cache, download it. if not os.path.exists(local_cache_path): - os.makedirs(os.path.dirname(local_cache_path), exist_ok=True) - print(f'Downloading: {Clr.BLU}{path}{Clr.RST}') - result = subprocess.run( - f'curl --fail --silent {url} --output {local_cache_path_dl}', - shell=True, - check=False, - ) - - # We prune old cache files on the server, so its possible for - # one to be trying to build something the server can no longer - # provide. try to explain the situation. - if result.returncode == 22: - raise CleanError( - 'Server gave an error. Old build files may no longer' - ' be available on the server; make sure you are using' - ' a recent commit.\n' - 'Note that build files will remain available' - ' indefinitely once downloaded, even if deleted by the' - f' server. So as long as your {CACHE_DIR_NAME} directory' - ' stays intact you should be able to repeat any builds you' - ' have run before.' 
+        with tempfile.TemporaryDirectory() as tmpdir:
+            local_cache_dl_path = os.path.join(tmpdir, 'dl')
+            print(f'Downloading: {Clr.BLU}{path}{Clr.RST}')
+            result = subprocess.run(
+                [
+                    'curl',
+                    '--fail',
+                    '--silent',
+                    url,
+                    '--output',
+                    local_cache_dl_path,
+                ],
+                check=False,
             )
-        if result.returncode != 0:
-            raise CleanError('Download failed; is your internet working?')
-
-        subprocess.run(
-            f'mv {local_cache_path_dl} {local_cache_path}',
-            shell=True,
-            check=True,
-        )
+
+            # We prune old cache files on the server, so it's possible for
+            # one to be trying to build something the server can no longer
+            # provide. Try to explain the situation.
+            if result.returncode == 22:
+                raise CleanError(
+                    'Server gave an error. Old build files may no longer'
+                    ' be available on the server; make sure you are using'
+                    ' a recent commit.\n'
+                    'Note that build files will remain available'
+                    ' indefinitely once downloaded, even if deleted by the'
+                    f' server. So as long as your {local_cache_dir} directory'
+                    ' stays intact you should be able to repeat any builds you'
+                    ' have run before.'
+                )
+            if result.returncode != 0:
+                raise CleanError('Download failed; is your internet working?')
+
+            # Ok; cache download finished. Lastly, move it into place to
+            # be as atomic as possible.
+            os.makedirs(os.path.dirname(local_cache_path), exist_ok=True)
+            subprocess.run(
+                ['mv', local_cache_dl_path, local_cache_path], check=True
+            )
 
     # Ok we should have a valid file in our cache dir at this point.
     # Just expand it to the target path.
     print(f'Extracting: {path}')
-    try:
+    # Extract and stage the file in a temp dir before doing a final
+    # move to the target location, to be as atomic as possible.
+    with tempfile.TemporaryDirectory() as tmpdir:
         with open(local_cache_path, 'rb') as infile:
             data = infile.read()
         header = data[:4]
@@ -188,18 +217,16 @@ def get_target(path: str) -> None:
         metajson = metabytes.decode()
         metadata = dataclass_from_json(CacheMetadata, metajson)
         data = zlib.decompress(datac)
-        os.makedirs(os.path.dirname(path), exist_ok=True)
-        with open(path, 'wb') as outfile:
+
+        tmppath = os.path.join(tmpdir, 'out')
+        with open(tmppath, 'wb') as outfile:
             outfile.write(data)
         if metadata.executable:
-            subprocess.run(['chmod', '+x', path], check=True)
-    except Exception:
-        # If something goes wrong, try to make sure we don't leave a
-        # half decompressed file lying around or whatnot.
-        print(f"Error expanding cache archive for '{path}'.")
-        if os.path.exists(path):
-            os.remove(path)
-        raise
+            subprocess.run(['chmod', '+x', tmppath], check=True)
+
+        # Ok; we wrote the file. Now move it into its final place.
+        os.makedirs(os.path.dirname(path), exist_ok=True)
+        subprocess.run(['mv', tmppath, path], check=True)
 
     if not os.path.exists(path):
         raise RuntimeError(f'File {path} did not wind up as expected.')
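For reference, the on-disk cache-entry layout that the extraction code above
and _cache_prefix_for_file() further down both rely on is: a 4-byte magic
header, one byte giving the length of a JSON-encoded CacheMetadata blob, the
metadata itself, then the zlib-compressed file contents. The standalone reader
below is only a sketch under that assumption; inspect_cache_entry and
EXPECTED_HEADER are illustrative names, not code from this commit.

import json
import zlib

# Hypothetical magic value; the real constant is CACHE_HEADER in
# efrocache.py and its value is not shown in this patch.
EXPECTED_HEADER = b'XXXX'


def inspect_cache_entry(cache_path: str) -> bytes:
    """Hypothetical helper: validate and decompress one cache entry.

    Assumed layout, mirroring get_target() above:
    [4-byte header][1-byte metadata length][metadata JSON][zlib data]
    """
    with open(cache_path, 'rb') as infile:
        data = infile.read()
    if data[:4] != EXPECTED_HEADER:
        raise ValueError('Not an efrocache entry.')
    metalen = data[4]
    metadata = json.loads(data[5 : 5 + metalen].decode())
    print('executable:', metadata.get('executable'))
    return zlib.decompress(data[5 + metalen :])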
@@ -340,8 +367,8 @@ def _upload_cache(
     # Now do the thing.
     staging_dir = 'build/efrocache'
     mapping_file = 'build/efrocachemap'
-    subprocess.run(f'rm -rf {staging_dir}', shell=True, check=True)
-    subprocess.run(f'mkdir -p {staging_dir}', shell=True, check=True)
+    subprocess.run(['rm', '-rf', staging_dir], check=True)
+    subprocess.run(['mkdir', '-p', staging_dir], check=True)
 
     _write_cache_files(fnames1, fnames2, staging_dir, mapping_file)
 
@@ -353,18 +380,26 @@ def _upload_cache(
     # Sync all individual cache files to the staging server.
     print(f'{Clr.SBLU}Pushing cache to staging...{Clr.RST}', flush=True)
     subprocess.run(
-        'rsync --progress --recursive --human-readable build/efrocache/'
-        ' ubuntu@staging.ballistica.net:files.ballistica.net/cache/ba1/',
-        shell=True,
+        [
+            'rsync',
+            '--progress',
+            '--recursive',
+            '--human-readable',
+            'build/efrocache/',
+            'ubuntu@staging.ballistica.net:files.ballistica.net/cache/ba1/',
+        ],
         check=True,
     )
 
     # Now generate the starter cache on the server..
     subprocess.run(
-        'ssh -oBatchMode=yes -oStrictHostKeyChecking=yes '
-        'ubuntu@staging.ballistica.net'
-        ' "cd files.ballistica.net/cache/ba1 && python3 genstartercache.py"',
-        shell=True,
+        [
+            'ssh',
+            '-oBatchMode=yes',
+            '-oStrictHostKeyChecking=yes',
+            'ubuntu@staging.ballistica.net',
+            'cd files.ballistica.net/cache/ba1 && python3 genstartercache.py',
+        ],
         check=True,
     )
 
@@ -393,11 +428,11 @@ def _write_cache_files(
     fhashes1: set[str] = set()
     fhashes2: set[str] = set()
     mapping: dict[str, str] = {}
-    call = functools.partial(_write_cache_file, staging_dir)
+    writecall = functools.partial(_write_cache_file, staging_dir)
 
     # Do the first set.
     with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
-        results = executor.map(call, fnames1)
+        results = executor.map(writecall, fnames1)
         for result in results:
             # mapping[result[0]] = f'{base_url}/{result[1]}'
             mapping[result[0]] = result[1]
@@ -405,7 +440,7 @@ def _write_cache_files(
 
     # Now finish up with the second set.
     with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
-        results = executor.map(call, fnames2)
+        results = executor.map(writecall, fnames2)
         for result in results:
             # mapping[result[0]] = f'{base_url}/result[1]'
             mapping[result[0]] = result[1]
@@ -455,33 +490,6 @@ def _write_cache_files(
         outfile.write(json.dumps(mapping, indent=2, sort_keys=True))
 
 
-def _cache_prefix_for_file(fname: str) -> bytes:
-    # pylint: disable=global-statement
-    global g_cache_prefix_exec
-    global g_cache_prefix_noexec
-
-    # We'll be calling this a lot when checking existing files, so we
-    # want it to be efficient. Let's cache the two options there are at
-    # the moment.
-    executable = os.access(fname, os.X_OK)
-    if executable:
-        if g_cache_prefix_exec is None:
-            metadata = dataclass_to_json(
-                CacheMetadata(executable=True)
-            ).encode()
-            assert len(metadata) < 256
-            g_cache_prefix_exec = (
-                CACHE_HEADER + len(metadata).to_bytes() + metadata
-            )
-        return g_cache_prefix_exec
-
-    # Ok; non-executable it is.
-    metadata = dataclass_to_json(CacheMetadata(executable=False)).encode()
-    assert len(metadata) < 256
-    g_cache_prefix_noexec = CACHE_HEADER + len(metadata).to_bytes() + metadata
-    return g_cache_prefix_noexec
-
-
 def _write_cache_file(staging_dir: str, fname: str) -> tuple[str, str]:
     import hashlib
 
@@ -511,6 +519,33 @@ def _write_cache_file(staging_dir: str, fname: str) -> tuple[str, str]:
 
     return fname, hashpath
 
 
+def _cache_prefix_for_file(fname: str) -> bytes:
+    # pylint: disable=global-statement
+    global g_cache_prefix_exec
+    global g_cache_prefix_noexec
+
+    # We'll be calling this a lot when checking existing files, so we
+    # want it to be efficient. Let's cache the two options there are at
+    # the moment.
+    executable = os.access(fname, os.X_OK)
+    if executable:
+        if g_cache_prefix_exec is None:
+            metadata = dataclass_to_json(
+                CacheMetadata(executable=True)
+            ).encode()
+            assert len(metadata) < 256
+            g_cache_prefix_exec = (
+                CACHE_HEADER + len(metadata).to_bytes() + metadata
+            )
+        return g_cache_prefix_exec
+
+    # Ok; non-executable it is.
+    metadata = dataclass_to_json(CacheMetadata(executable=False)).encode()
+    assert len(metadata) < 256
+    g_cache_prefix_noexec = CACHE_HEADER + len(metadata).to_bytes() + metadata
+    return g_cache_prefix_noexec
+
+
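A note on the pattern running through these hunks: shell-string invocations of
subprocess.run are replaced with argument lists, which drops shell=True and
sidesteps quoting problems when an argument contains spaces or shell
metacharacters. A tiny illustrative comparison (not from the commit; the path
here is hypothetical):

import subprocess

staging_dir = 'build/my efrocache dir'  # hypothetical path with a space

# Old style: the space would split the argument unless quoted by hand.
#   subprocess.run(f'rm -rf {staging_dir}', shell=True, check=True)

# New style used throughout this patch: each argument is passed through
# verbatim, with no shell involved.
subprocess.run(['rm', '-rf', staging_dir], check=True)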
 def _check_warm_start_entry(entry: tuple[str, str]) -> None:
     # import hashlib
@@ -530,40 +565,58 @@ def _check_warm_start_entries(entries: list[tuple[str, str]]) -> None:
 
 def warm_start_cache() -> None:
     """Run a pre-pass on the efrocache to improve efficiency."""
+    import tempfile
 
     base_url = get_repository_base_url()
+    local_cache_dir = get_local_cache_dir()
 
     # We maintain a starter-cache on the staging server, which is simply
-    # the latest set of cache entries compressed into a single
-    # compressed archive. If we have no local cache yet we can download
-    # and expand this to give us a nice head start and greatly reduce
-    # the initial set of individual files we have to fetch. (downloading
-    # a single compressed archive is much more efficient than
-    # downloading thousands)
-    if not os.path.exists(CACHE_DIR_NAME):
-        print('Downloading asset starter-cache...', flush=True)
-        subprocess.run(
-            f'curl --fail {base_url}/startercache.tar.xz'
-            f' --output startercache.tar.xz',
-            shell=True,
-            check=True,
-        )
-        print('Decompressing starter-cache...', flush=True)
-        subprocess.run('tar -xf startercache.tar.xz', shell=True, check=True)
-        subprocess.run(f'mv efrocache {CACHE_DIR_NAME}', shell=True, check=True)
-        subprocess.run('rm startercache.tar.xz', shell=True, check=True)
-        print(
-            'Starter-cache fetched successfully!'
-            ' (should speed up asset builds)'
-        )
+    # a set of commonly used recent cache entries compressed into a
+    # single archive. If we have no local cache yet we can download and
+    # expand this to give us a nice head start and greatly reduce the
+    # initial set of individual files we have to fetch. (downloading a
+    # single compressed archive is much more efficient than downloading
+    # thousands)
+    if not os.path.exists(local_cache_dir):
+        print('Downloading efrocache starter-cache...', flush=True)
 
-    # In the public build, let's scan through all files managed by
-    # efrocache and update any with timestamps older than the latest
-    # cache-map that we already have the data for. Otherwise those files
-    # will update individually the next time they are 'built'. Even
-    # though that only takes a fraction of a second per file, it adds up
-    # when done for thousands of assets each time the cache map changes.
-    # It is much more efficient to do it in one go here.
+        # Download and decompress the starter-cache into a temp dir
+        # and then move it into place as our shiny new cache dir.
+        with tempfile.TemporaryDirectory() as tmpdir:
+            starter_cache_file_path = os.path.join(
+                tmpdir, 'startercache.tar.xz'
+            )
+            subprocess.run(
+                [
+                    'curl',
+                    '--fail',
+                    f'{base_url}/startercache.tar.xz',
+                    '--output',
+                    starter_cache_file_path,
+                ],
+                check=True,
+            )
+            print('Decompressing starter-cache...', flush=True)
+            subprocess.run(
+                ['tar', '-xf', starter_cache_file_path], cwd=tmpdir, check=True
+            )
+            os.makedirs(os.path.dirname(local_cache_dir), exist_ok=True)
+            subprocess.run(
+                ['mv', os.path.join(tmpdir, 'efrocache'), local_cache_dir],
+                check=True,
+            )
+            print(
+                'Starter-cache fetched successfully! (should speed up builds).'
+            )
+
+    # In the public project, let's also scan through all project files
+    # managed by efrocache and update timestamps on any that we already
+    # have the data for, so they are not older than the latest cache
+    # map. Otherwise those files will update their own timestamps
+    # individually the next time they are 'built'. Even though that only
+    # takes a fraction of a second per file, it adds up when done for
+    # thousands of files each time the cache map changes. It is much
+    # more efficient to do it all in one go here.
     cachemap: dict[str, str]
     with open(CACHE_MAP_NAME, encoding='utf-8') as infile:
         cachemap = json.loads(infile.read())
@@ -580,7 +633,7 @@ def warm_start_cache() -> None:
             continue
 
         # Don't have the cache source file for this guy = ignore.
-        cachefile = CACHE_DIR_NAME + '/' + '/'.join(url.split('/')[-3:])
+        cachefile = local_cache_dir + '/' + '/'.join(url.split('/')[-3:])
         if not os.path.exists(cachefile):
             continue
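The timestamp-refresh pass described in the comment above continues past the
end of this hunk. As a rough sketch of the idea only (refresh_timestamps is a
hypothetical helper, and the real code presumably also re-verifies hashes via
the _check_warm_start_entry helpers), it amounts to touching every
efrocache-managed file whose cache entry already exists locally:

import json
import os


def refresh_timestamps(cache_map_path: str, cache_dir: str) -> None:
    """Hypothetical sketch of the warm-start timestamp pass.

    Touch every efrocache-managed target whose cache entry is already
    present locally so it reads as newer than the cache map and is not
    re-fetched one file at a time later.
    """
    with open(cache_map_path, encoding='utf-8') as infile:
        cachemap: dict[str, str] = json.loads(infile.read())
    for path, url in cachemap.items():
        if not os.path.isfile(path):
            continue  # Target isn't present; nothing to refresh.
        cachefile = os.path.join(cache_dir, *url.split('/')[-3:])
        if not os.path.exists(cachefile):
            continue  # Don't have the cache source file; ignore.
        os.utime(path)  # Bump mtime to now.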