simplifying .efrocachemap a bit further from relative paths to just hashes

Eric 2023-07-28 16:12:49 -07:00
parent a6246b9eb7
commit 9f660dd056
GPG Key ID: 89C93F0F8D6D5A98 (no known key found for this signature in database)
5 changed files with 4148 additions and 4136 deletions

.efrocachemap (generated): 8248 changes
File diff suppressed because it is too large.


@@ -16,7 +16,9 @@
 <w>aaak</w>
 <w>aarch</w>
 <w>aate</w>
+<w>abcdefghijkl</w>
 <w>abcdefghijklmnopqrstuvwxyz</w>
+<w>abcdefghijlk</w>
 <w>abcdefghjkmnpqrtuvwxy</w>
 <w>abeb</w>
 <w>abishort</w>
@@ -856,6 +858,7 @@
 <w>editorconfig</w>
 <w>efca</w>
 <w>effmult</w>
+<w>efghijkl</w>
 <w>efgrd</w>
 <w>efile</w>
 <w>efro</w>
@@ -3263,6 +3266,7 @@
 <w>wreadlink</w>
 <w>wref</w>
 <w>writeauxiliaryfile</w>
+<w>writecall</w>
 <w>writeclasses</w>
 <w>writefuncs</w>
 <w>wslpath</w>


@@ -10,6 +10,8 @@
 could set up a server that never gets pruned and contains all history from now
 until forever. Efrocache is basically just a big pile of files organized by
 their hashes (see `tools/efrotools/efrocache.py` for details).
+- On a related note, the .efrocachemap file now just contains hashes instead of
+  full urls per file (which were based on those hashes anyway).
 - The default efrocache file location is now `.cache/efrocache` instead of
   `.efrocache`. Feel free to blow away any `.efrocache` dir if you still have
   one (or move it to the new path to avoid having to download things again).
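To make the new format concrete, here is a minimal sketch of how a map entry changes; the file path and hash below are invented for illustration (real values are md5 hexdigests produced by the cache-writing code in this commit):

# Hypothetical .efrocachemap entries (path and hash are made up).
# Before: the value was a hash-derived relative url path.
entry_before = {'build/assets/icon.png': 'd4/1d/8cd98f00b204e9800998ecf8427e'}
# After: the value is just the bare hash; clients derive the
# 'd4/1d/...' subpath from it when fetching.
entry_after = {'build/assets/icon.png': 'd41d8cd98f00b204e9800998ecf8427e'}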


@@ -6,7 +6,9 @@
 <w>aaaand</w>
 <w>aabb</w>
 <w>aate</w>
+<w>abcdefghijkl</w>
 <w>abcdefghijklmnopqrstuvwxyz</w>
+<w>abcdefghijlk</w>
 <w>abcdefghjkmnpqrtuvwxy</w>
 <w>abouttab</w>
 <w>absval</w>
@@ -526,6 +528,7 @@
 <w>edef</w>
 <w>efca</w>
 <w>effmult</w>
+<w>efghijkl</w>
 <w>efro</w>
 <w>efrocaching</w>
 <w>efrohack</w>
@@ -1900,6 +1903,7 @@
 <w>wprjp</w>
 <w>wreadlink</w>
 <w>writeauxiliaryfile</w>
+<w>writecall</w>
 <w>wspath</w>
 <w>wsroot</w>
 <w>wtfslice</w>


@@ -128,18 +128,20 @@ def get_target(path: str) -> None:
         efrocachemap = json.loads(infile.read())
     if path not in efrocachemap:
         raise RuntimeError(f'Path not found in efrocache: {path}')
-    relurl = efrocachemap[path]
+    hashval = efrocachemap[path]

-    # These used to be abs paths but are now relative.
-    assert not relurl.startswith('https:')
-    assert not relurl.startswith('/')
+    # These used to be url paths but now they're just hashes.
+    assert not hashval.startswith('https:')
+    assert '/' not in hashval
+
+    # If our hash is 'abcdefghijkl', our subpath is 'ab/cd/efghijkl'.
+    subpath = '/'.join([hashval[:2], hashval[2:4], hashval[4:]])

     repo = get_repository_base_url()
-    url = f'{repo}/{relurl}'
-    subpath = '/'.join(url.split('/')[-3:])
+    url = f'{repo}/{subpath}'
     local_cache_path = os.path.join(local_cache_dir, subpath)
-    hashval = ''.join(subpath.split('/'))

     # First off: if there's already a cache file in place, check its
     # hash. If its calced hash matches its path, we can just update its
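As a quick illustration of the new lookup logic above (the hash here is an arbitrary md5 hexdigest, not a real cache entry):

hashval = 'd41d8cd98f00b204e9800998ecf8427e'
# Split into 2/2/rest, mirroring the 'ab/cd/efghijkl' comment above.
subpath = '/'.join([hashval[:2], hashval[2:4], hashval[4:]])
print(subpath)  # -> d4/1d/8cd98f00b204e9800998ecf8427e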
@@ -436,7 +438,7 @@ def _write_cache_files(
     for result in results:
         # mapping[result[0]] = f'{base_url}/{result[1]}'
         mapping[result[0]] = result[1]
-        fhashes1.add(result[1])
+        fhashes1.add(result[2])

     # Now finish up with the second set.
     with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
@@ -444,7 +446,7 @@ def _write_cache_files(
     for result in results:
         # mapping[result[0]] = f'{base_url}/result[1]'
         mapping[result[0]] = result[1]
-        fhashes2.add(result[1])
+        fhashes2.add(result[2])

     # We want the server to have a startercache.tar.xz file which
     # contains the entire first set. It is much more efficient to build
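The two loops above read more clearly with tuple unpacking; a sketch assuming each result is the (fname, finalhash, hashpath) triple returned by the updated _write_cache_file below:

for fname, finalhash, hashpath in results:
    mapping[fname] = finalhash  # .efrocachemap now stores the bare hash.
    fhashes1.add(hashpath)      # Collect the on-disk cache file paths.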
@@ -490,7 +492,7 @@ def _write_cache_files(
         outfile.write(json.dumps(mapping, indent=2, sort_keys=True))


-def _write_cache_file(staging_dir: str, fname: str) -> tuple[str, str]:
+def _write_cache_file(staging_dir: str, fname: str) -> tuple[str, str, str]:
     import hashlib

     print(f'Caching {fname}')
@@ -505,7 +507,7 @@ def _write_cache_file(staging_dir: str, fname: str) -> tuple[str, str]:
     # lots of existing files when seeing if they need to be updated.
     # Just going with ol' md5 here; we're the only ones creating these
-    # so security isn't a concern.
+    # so security isn't a concern currently.
     md5 = hashlib.md5()
     md5.update(prefix + fdataraw)
     finalhash = md5.hexdigest()
@@ -516,7 +518,7 @@ def _write_cache_file(staging_dir: str, fname: str) -> tuple[str, str]:
     with open(path, 'wb') as outfile:
         outfile.write(prefix + zlib.compress(fdataraw))

-    return fname, hashpath
+    return fname, finalhash, hashpath


def _cache_prefix_for_file(fname: str) -> bytes:
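Putting the pieces together, a rough sketch of the hashing scheme in the hunks above, with made-up stand-ins for the real prefix and file data:

import hashlib

prefix = b'example-prefix'           # Stand-in for _cache_prefix_for_file().
fdataraw = b'example file contents'  # Stand-in for the staged file's bytes.

md5 = hashlib.md5()
md5.update(prefix + fdataraw)
finalhash = md5.hexdigest()

# Assuming the staged cache path follows the same ab/cd/efghijkl
# scheme that get_target() uses when fetching:
hashpath = '/'.join([finalhash[:2], finalhash[2:4], finalhash[4:]])
print(finalhash, hashpath)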