# Copyright (c) 2011-2020 Eric Froemling
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# -----------------------------------------------------------------------------
"""A simple cloud caching system for making built binaries/assets available.
|
|
|
|
The basic idea here is the ballistica-internal project can flag file targets
|
|
in its Makefiles as 'cached', and the public version of those Makefiles will
|
|
be filtered to contain cache downloads in place of the original build commands.
|
|
Cached files are gathered and uploaded as part of the pubsync process.
|
|
"""
|
|
|
|
from __future__ import annotations

import os
import subprocess
import json
from typing import TYPE_CHECKING
from multiprocessing import cpu_count
from concurrent.futures import ThreadPoolExecutor

from efro.terminal import Clr

if TYPE_CHECKING:
    from typing import List, Dict, Tuple, Set

BASE_URL = 'https://files.ballistica.net/cache/ba1/'

TARGET_TAG = '#__EFROCACHE_TARGET__'
STRIP_BEGIN_TAG = '#__EFROCACHE_STRIP_BEGIN__'
STRIP_END_TAG = '#__EFROCACHE_STRIP_END__'
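# (How these tags get used, as an illustrative sketch: in the internal
# Makefiles a cacheable rule is preceded by a line containing TARGET_TAG,
# and internal-only sections are bracketed by the STRIP tags;
# filter_makefile() below does the actual rewriting for the public copies.)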

CACHE_DIR_NAME = '.efrocache'
CACHE_MAP_NAME = '.efrocachemap'

UPLOAD_STATE_CACHE_FILE = '.cache/efrocache_upload_state'


def get_file_hash(path: str) -> str:
    """Return the hash used for caching.

    This incorporates the file contents as well as its path.
    """
    import hashlib
    md5 = hashlib.md5()
    with open(path, 'rb') as infile:
        md5.update(infile.read())
    md5.update(path.encode())
    return md5.hexdigest()


def get_target(path: str) -> None:
    """Fetch a target path from the cache, downloading if need be."""
    from efro.error import CleanError
    from efrotools import run
    with open(CACHE_MAP_NAME) as infile:
        efrocachemap = json.loads(infile.read())
    if path not in efrocachemap:
        raise RuntimeError(f'Path not found in efrocache: {path}')
    url = efrocachemap[path]
    subpath = '/'.join(url.split('/')[-3:])
    local_cache_path = os.path.join(CACHE_DIR_NAME, subpath)
    local_cache_path_dl = local_cache_path + '.download'
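    # (Downloads go to a temporary '.download' path and only get moved
    # into place once curl succeeds, so an interrupted download can't
    # leave a partial file at the final cache location.)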
    hashval = ''.join(subpath.split('/'))
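    # The last three URL components are just the file's content hash
    # split up for directory fan-out (roughly '.../ab/12/cd...'), so
    # re-joining them recovers the hash that _write_cache_file() stored
    # the file under at upload time.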

    # First off: if there's already a file in place, check its hash.
    # If it matches the cache, we can just update its timestamp and
    # call it a day.
    if os.path.isfile(path):
        existing_hash = get_file_hash(path)
        if existing_hash == hashval:
            os.utime(path, None)
            print(f'Refreshing from cache: {path}')
            return

    # Ok, there's no valid file in place already.
    # Clear out whatever is there to start with.
    if os.path.exists(path):
        os.unlink(path)

    # Now if we don't have this entry in our local cache,
    # download it.
    if not os.path.exists(local_cache_path):
        os.makedirs(os.path.dirname(local_cache_path), exist_ok=True)
        print(f'Downloading: {Clr.SBLU}{path}{Clr.RST}')
        result = subprocess.run(
            f'curl --fail --silent {url} --output {local_cache_path_dl}',
            shell=True,
            check=False)

        # We prune old cache files on the server, so it's possible for one
        # to be trying to build something the server can no longer provide.
        # Try to explain the situation.
        if result.returncode == 22:
            raise CleanError('Server gave an error.'
                             ' Old build files may no longer be available;'
                             ' make sure you are using a recent commit.')
        if result.returncode != 0:
            raise CleanError('Download failed; is your internet working?')

        run(f'mv {local_cache_path_dl} {local_cache_path}')

    # Ok, we should have a valid .tar.gz file in our cache dir at this point.
    # Just expand it and it gets placed wherever it belongs.
    print(f'Extracting: {path}')
    run(f'tar -zxf {local_cache_path}')

    # The file will wind up with the timestamp it was compressed with,
    # so let's update its timestamp or else it will still be considered
    # dirty.
    run(f'touch {path}')
    if not os.path.exists(path):
        raise RuntimeError(f'File {path} did not wind up as expected.')


def filter_makefile(makefile_dir: str, contents: str) -> str:
    """Filter makefile contents to use efrocache lookups."""

    if makefile_dir:
        # Assuming just one level deep at the moment; can revisit later.
        assert '/' not in makefile_dir
        to_proj_root = '..'
    else:
        to_proj_root = ''

    cachemap = os.path.join(to_proj_root, CACHE_MAP_NAME)
    lines = contents.splitlines()
    snippets = 'tools/snippets'

    # Strip out parts they don't want.
    while STRIP_BEGIN_TAG in lines:
        index = lines.index(STRIP_BEGIN_TAG)
        endindex = index
        while lines[endindex] != STRIP_END_TAG:
            endindex += 1

        # If the line after us is blank, include it too to keep spacing clean.
        if not lines[endindex + 1].strip():
            endindex += 1

        del lines[index:endindex + 1]

    # Replace cacheable targets with cache lookups.
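    # For example (an illustrative sketch; 'build/foo' and the 'assets'
    # subdir are made up), a tagged block along the lines of:
    #
    #   #__EFROCACHE_TARGET__
    #   build/foo: src/foo.cpp
    #       (original build recipe...)
    #
    # gets replaced with a rule that simply pulls the prebuilt result
    # from the cache:
    #
    #   build/foo: ../.efrocachemap
    #       @cd .. && tools/snippets efrocache_get assets/$@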
    while TARGET_TAG in lines:
        index = lines.index(TARGET_TAG)
        endindex = index
        while lines[endindex].strip() != '':
            endindex += 1
        tname = lines[index + 1].split(':')[0]
        del lines[index:endindex]
        lines.insert(index, tname + ': ' + cachemap)
        target = (makefile_dir + '/' + '$@') if makefile_dir else '$@'
        pre = f'cd {to_proj_root} && ' if makefile_dir else ''
        lines.insert(index + 1, f'\t@{pre}{snippets} efrocache_get {target}')
    return '\n'.join(lines) + '\n'


def update_cache(makefile_dirs: List[str]) -> None:
    """Given a list of directories containing Makefiles, update caches."""

    cpus = cpu_count()
    fnames1: List[str] = []
    fnames2: List[str] = []
    for path in makefile_dirs:
        cdp = f'cd {path} && ' if path else ''

        # First, make sure all cache files are built.
        mfpath = os.path.join(path, 'Makefile')
        print(f'Building efrocache targets for {Clr.SBLU}{mfpath}{Clr.RST}...')
        subprocess.run(f'{cdp}make -j{cpus} efrocache-build',
                       shell=True,
                       check=True)

        rawpaths = subprocess.run(f'{cdp}make efrocache-list',
                                  shell=True,
                                  check=True,
                                  capture_output=True).stdout.decode().split()

        # Make sure the paths they gave were relative.
        for rawpath in rawpaths:
            if rawpath.startswith('/'):
                raise RuntimeError(f'Invalid path returned for caching '
                                   f'(absolute paths not allowed): {rawpath}')

        # Break these into 2 lists, one of which will be included in the
        # starter-cache.
        for rawpath in rawpaths:
            fullpath = os.path.join(path, rawpath)

            # The main reason for this cache is to reduce round trips to
            # the staging server for tiny files, so let's include small
            # files only here. For larger stuff it's ok to have a request
            # per file.
            if os.path.getsize(fullpath) < 100000:
                fnames1.append(fullpath)
            else:
                fnames2.append(fullpath)

    # Ok, we've got 2 lists of filenames that we need to cache in the cloud.
    # First, however, let's hash everything, and if it's all exactly the
    # same as last time we can skip this step.
    hashes = _gen_hashes(fnames1 + fnames2)
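    # (The result is a compact JSON dump mapping each path to its md5,
    # which we compare verbatim against the state file from the last run.)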
    if os.path.isfile(UPLOAD_STATE_CACHE_FILE):
        with open(UPLOAD_STATE_CACHE_FILE) as infile:
            hashes_existing = infile.read()
    else:
        hashes_existing = ''
    if hashes == hashes_existing:
        print(
            f'{Clr.SBLU}Efrocache state unchanged;'
            f' skipping cache push.{Clr.RST}',
            flush=True)
    else:
        _upload_cache(fnames1, fnames2, hashes, hashes_existing)

    print(f'{Clr.SBLU}Efrocache update successful!{Clr.RST}')

    # Write the cache state so we can skip the next run if nothing changes.
    os.makedirs(os.path.dirname(UPLOAD_STATE_CACHE_FILE), exist_ok=True)
    with open(UPLOAD_STATE_CACHE_FILE, 'w') as outfile:
        outfile.write(hashes)


def _upload_cache(fnames1: List[str], fnames2: List[str], hashes_str: str,
                  hashes_existing_str: str) -> None:
    from efrotools import run

    # First, if we've run before, print the files causing us to re-run:
    if hashes_existing_str != '':
        changed_files: Set[str] = set()
        hashes = json.loads(hashes_str)
        hashes_existing = json.loads(hashes_existing_str)
        for fname, fhash in hashes.items():
            if fhash != hashes_existing.get(fname, ''):
                changed_files.add(fname)

        # We've covered modifications and additions; add deletions:
        for fname in hashes_existing:
            if fname not in hashes:
                changed_files.add(fname)
        print(f'{Clr.SBLU}Updating efrocache due to'
              f' {len(changed_files)} changes:{Clr.RST}')
        for fname in sorted(changed_files):
            print(f'  {Clr.SBLU}{fname}{Clr.RST}')

    # Now do the thing.
    staging_dir = 'build/efrocache'
    mapping_file = 'build/efrocachemap'
    run(f'rm -rf {staging_dir}')
    run(f'mkdir -p {staging_dir}')

    _write_cache_files(fnames1, fnames2, staging_dir, mapping_file)

    print(f'{Clr.SBLU}Starter cache includes {len(fnames1)} items;'
          f' excludes {len(fnames2)}{Clr.RST}')

    # Sync all individual cache files to the staging server.
    print(f'{Clr.SBLU}Pushing cache to staging...{Clr.RST}', flush=True)
    run('rsync --progress --recursive build/efrocache/'
        ' ubuntu@ballistica.net:files.ballistica.net/cache/ba1/')

    # Now generate the starter cache on the server.
    run('ssh -oBatchMode=yes -oStrictHostKeyChecking=yes ubuntu@ballistica.net'
        ' "cd files.ballistica.net/cache/ba1 && python3 genstartercache.py"')


def _gen_hashes(fnames: List[str]) -> str:
    import hashlib

    def _get_file_hash(fname: str) -> Tuple[str, str]:
        md5 = hashlib.md5()
        with open(fname, mode='rb') as infile:
            md5.update(infile.read())
        return fname, md5.hexdigest()

    # Now hash the files in parallel, using one worker thread per core.
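    # (Threads are sufficient here; most of the time is file I/O, and
    # CPython's hashlib releases the GIL while hashing large buffers.)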
    with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
        hashes = dict(executor.map(_get_file_hash, fnames))

    return json.dumps(hashes, separators=(',', ':'))


def _write_cache_files(fnames1: List[str], fnames2: List[str],
                       staging_dir: str, mapping_file: str) -> None:
    import functools
    fhashes1: Set[str] = set()
    fhashes2: Set[str] = set()
    mapping: Dict[str, str] = {}
    call = functools.partial(_write_cache_file, staging_dir)

    # Do the first set.
    with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
        results = executor.map(call, fnames1)
        for result in results:
            mapping[result[0]] = BASE_URL + result[1]
            fhashes1.add(result[1])

    # Now finish up with the second set.
    with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
        results = executor.map(call, fnames2)
        for result in results:
            mapping[result[0]] = BASE_URL + result[1]
            fhashes2.add(result[1])

    # We want the server to have a startercache.tar.xz file which contains
    # the entire first set. It is much more efficient to build that file
    # on the server than it is to build it here and upload the whole thing.
    # ...so let's simply write a script to generate it and upload that.

    # Also let's have the script touch both sets of files so we can use
    # mod-times to prune older files. (otherwise files that never change
    # might have very old mod times)
    script = (
        'import os\n'
        'import pathlib\n'
        'import subprocess\n'
        'fnames = ' + repr(fhashes1) + '\n'
        'fnames2 = ' + repr(fhashes2) + '\n'
        'subprocess.run(["rm", "-rf", "efrocache"], check=True)\n'
        'print("Copying starter cache files...", flush=True)\n'
        'for fname in fnames:\n'
        '    dst = os.path.join("efrocache", fname)\n'
        '    os.makedirs(os.path.dirname(dst), exist_ok=True)\n'
        '    subprocess.run(["cp", fname, dst], check=True)\n'
        'print("Touching full file set...", flush=True)\n'
        'for fname in list(fnames) + list(fnames2):\n'
        '    fpath = pathlib.Path(fname)\n'
        '    assert fpath.exists()\n'
        '    fpath.touch()\n'
        'print("Compressing starter cache archive...", flush=True)\n'
        'subprocess.run(["tar", "-Jcf", "tmp.tar.xz", "efrocache"],'
        ' check=True)\n'
        'subprocess.run(["mv", "tmp.tar.xz", "startercache.tar.xz"],'
        ' check=True)\n'
        'subprocess.run(["rm", "-rf", "efrocache", "genstartercache.py"])\n'
        'print("Starter cache generation complete!", flush=True)\n')

    with open('build/efrocache/genstartercache.py', 'w') as outfile:
        outfile.write(script)
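    # (This script rides along when the staging dir gets rsynced up and
    # is then run on the server by the ssh step in _upload_cache().)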

    with open(mapping_file, 'w') as outfile:
        outfile.write(json.dumps(mapping, indent=2, sort_keys=True))


def _write_cache_file(staging_dir: str, fname: str) -> Tuple[str, str]:
    import hashlib
    from efrotools import run
    print(f'Caching {fname}')
    if ' ' in fname:
        raise RuntimeError('Spaces in paths not supported.')

    # Just going with ol' md5 here; we're the only ones creating these so
    # security isn't a concern.
    md5 = hashlib.md5()
    with open(fname, 'rb') as infile:
        md5.update(infile.read())
    md5.update(fname.encode())
    finalhash = md5.hexdigest()
    hashpath = os.path.join(finalhash[:2], finalhash[2:4], finalhash[4:])
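    # (So a hash such as 'ab12cd...' lands at 'ab/12/cd...', fanning the
    # files out across directories on the server.)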
    path = os.path.join(staging_dir, hashpath)
    os.makedirs(os.path.dirname(path), exist_ok=True)

    # Fancy pipe stuff which will give us deterministic tar.gz files
    # with no embedded timestamps.
    # Note: The 'COPYFILE_DISABLE' prevents mac tar from adding
    # file attributes/resource-forks to the archive as ._filename.
    run(f'COPYFILE_DISABLE=1 tar cf - {fname} | gzip -n > {path}')
    return fname, hashpath


def _check_warm_start_entry(entry: Tuple[str, str]) -> None:
    import hashlib
    fname, filehash = entry
    md5 = hashlib.md5()
    with open(fname, 'rb') as infile:
        md5.update(infile.read())
    md5.update(fname.encode())
    finalhash = md5.hexdigest()

    # If the file still matches the hash value we have for it,
    # go ahead and update its timestamp.
    if finalhash == filehash:
        os.utime(fname, None)


def _check_warm_start_entries(entries: List[Tuple[str, str]]) -> None:
    with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
        # (Converting this to a list pulls results and propagates errors.)
        list(executor.map(_check_warm_start_entry, entries))


def warm_start_cache() -> None:
    """Run a pre-pass on the efrocache to improve efficiency."""
    from efrotools import run

    # We maintain a starter-cache on the staging server, which is simply
    # the latest set of cache entries bundled into a single compressed
    # archive. If we have no local cache yet we can download and expand
    # this to give us a nice head start and greatly reduce the initial
    # set of individual files we have to fetch.
    # (downloading a single compressed archive is much more efficient
    # than downloading thousands)
    if not os.path.exists(CACHE_DIR_NAME):
        print('Downloading asset starter-cache...', flush=True)
        run(f'curl --fail {BASE_URL}startercache.tar.xz'
            ' --output startercache.tar.xz')
        print('Decompressing starter-cache...', flush=True)
        run('tar -xf startercache.tar.xz')
        run(f'mv efrocache {CACHE_DIR_NAME}')
        run('rm startercache.tar.xz')
        print('Starter-cache fetched successfully!'
              ' (should speed up asset builds)')
# In the public build, let's scan through all files managed by
|
|
# efrocache and update any with timestamps older than the latest
|
|
# cache-map that we already have the data for.
|
|
# Otherwise those files will update individually the next time
|
|
# they are 'built'. Even though that only takes a fraction of a
|
|
# second per file, it adds up when done for thousands of assets
|
|
# each time the cache map changes. It is much more efficient to do
|
|
# it in one go here.
|
|
cachemap: Dict[str, str]
|
|
with open(CACHE_MAP_NAME) as infile:
|
|
cachemap = json.loads(infile.read())
|
|
assert isinstance(cachemap, dict)
|
|
cachemap_mtime = os.path.getmtime(CACHE_MAP_NAME)
|
|
entries: List[Tuple[str, str]] = []
|
|
for fname, url in cachemap.items():
|
|
|
|
# File hasn't been pulled from cache yet = ignore.
|
|
if not os.path.exists(fname):
|
|
continue
|
|
|
|
# File is newer than the cache map = ignore.
|
|
if cachemap_mtime < os.path.getmtime(fname):
|
|
continue
|
|
|
|
# Don't have the cache source file for this guy = ignore.
|
|
cachefile = CACHE_DIR_NAME + '/' + '/'.join(url.split('/')[-3:])
|
|
if not os.path.exists(cachefile):
|
|
continue
|
|
|
|
# Ok, add it to the list of files we can potentially update timestamps
|
|
# on once we check its hash.
|
|
filehash = ''.join(url.split('/')[-3:])
|
|
entries.append((fname, filehash))
|
|
|
|
if entries:
|
|
# Now fire off a multithreaded executor to check hashes and update
|
|
# timestamps.
|
|
_check_warm_start_entries(entries)
|