Skip to content

Commit 7e8d638

Browse files
committed
Add levels --stats subcommand
[skip ci]
1 parent ceb09ae commit 7e8d638

3 files changed

Lines changed: 135 additions & 89 deletions

File tree

‎scripts/gdrive_cache.py‎

Lines changed: 56 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from pathlib import Path
1010
from typing import Callable, Optional
1111
from tqdm import tqdm
12-
import gdrive_base
12+
import sys
1313
import gdrive
1414
import shutil
1515
from strutils import (
@@ -168,7 +168,7 @@ def find_unlinked_tag_content(
168168
) -> list[dict]:
169169
ret = []
170170
for tag in website.tags:
171-
all_files = find_files_for_tag(tag.slug, include_folders)
171+
all_files = find_files_for_tag(tag.slug, include_av=include_av, include_folders=include_folders)
172172
all_files = [
173173
file for file in all_files
174174
if file['id'] not in website.data.linked_ids
@@ -426,68 +426,7 @@ def download_file_to_cache(file: dict, verbose=False) -> str | None:
426426
if file['id'] in SEEN_IDS:
427427
return None
428428
SEEN_IDS.add(file['id'])
429-
if file['mimeType'] == 'application/vnd.google-apps.shortcut':
430-
tfile = gdrive.gcache.get_item(file['shortcutDetails']['targetId'])
431-
if not tfile:
432-
print(f"WARNING: Skipping dangling shortcut \"{file['name']}\" in {file['parent_id']}")
433-
return None
434-
file = tfile
435-
436-
is_gdoc = file['mimeType'] == 'application/vnd.google-apps.document'
437-
is_gsheet = file['mimeType'] == 'application/vnd.google-apps.spreadsheet'
438-
if is_gdoc or is_gsheet:
439-
hashval = md5(file['id'] + str(file['version']))
440-
else:
441-
hashval = file.get('md5Checksum')
442-
if not isinstance(hashval, str):
443-
return None
444-
assert len(hashval) == 32
445-
446-
if is_gdoc:
447-
extension = '.docx'
448-
elif is_gsheet:
449-
extension = '.xlsx'
450-
elif '.' in file.get('name', ''):
451-
extension = '.' + str(file['name']).split('.')[-1].lower()
452-
else:
453-
extension = guess_extension(file['mimeType']) or ''
454-
455-
cache_dir = gdrive.gcache.file_cache_dir
456-
assert isinstance(cache_dir, Path)
457-
target_path = cache_dir / hashval[:2] / f"{hashval[2:]}{extension}"
458-
if target_path.exists():
459-
if verbose:
460-
print(f" Skipping already downloaded {file['name']}")
461-
return str(target_path)
462-
target_path.parent.mkdir(exist_ok=True)
463-
if verbose:
464-
print(f" Downloading {file['name']}")
465-
if not (is_gdoc or is_gsheet):
466-
try:
467-
gdrive_base.download_file(file['id'], target_path, verbose=False)
468-
except FileNotFoundError as e:
469-
if target_path.exists():
470-
# Another thread got this file before us 😅
471-
pass
472-
else:
473-
raise e
474-
else:
475-
try:
476-
if is_gdoc:
477-
gdrive_base.download_gdoc_as_docx(file['id'], target_path)
478-
elif is_gsheet:
479-
gdrive_base.download_gsheet_as_xlsx(file['id'], target_path)
480-
except gerrors.HttpError as e:
481-
if "exportSizeLimitExceeded" in str(e):
482-
if verbose:
483-
print(f" Skipping {file['name']}: it's too large to be exported :(")
484-
return None
485-
if "cannot be exported" in str(e):
486-
return None
487-
raise e
488-
if verbose:
489-
print(f" Saved to {target_path.parent.name}/{target_path.name}")
490-
return str(target_path)
429+
return gdrive.gcache.download_file_to_cache(file)
491430

492431
def run_backup_level(level: BackupLevel, parallelism=14):
493432
print(f"Starting backup level {level.level} ({level.name})...")
@@ -610,7 +549,11 @@ def backup_main(from_level: int=0, new_max_level: int | None=None, parallelism:
610549
run_backup_level(level, parallelism=parallelism)
611550
print(f"All files with priority <= {max_level} are now saved locally!")
612551

613-
def print_backup_levels_list():
552+
def format_cache_percentage(dl_size: int, total_size: int) -> str:
    """Return a human-readable fill ratio, e.g. "42.3% (1.0 GB/2.4 GB)".

    dl_size: bytes already present in the local cache.
    total_size: total bytes expected; a value of 0 (empty level) is
    reported as [N/A] instead of raising ZeroDivisionError.
    """
    if total_size <= 0:
        # Nothing to cache at this level; a ratio is meaningless.
        return f"[N/A] ({format_size(dl_size)}/{format_size(total_size)})"
    return f"{(float(dl_size)/total_size):.1%} ({format_size(dl_size)}/{format_size(total_size)})"
554+
555+
def print_backup_levels_list(statistics: bool=False):
556+
"""`statistics` replaces the generic description with current fill level stats"""
614557
print("\033[1mGoogle Drive Backup Levels\033[0m")
615558
print(
616559
"""
@@ -623,24 +566,45 @@ def print_backup_levels_list():
623566
)
624567
seen_file_ids = set()
625568
cum_sum_size = 0
569+
cum_sum_dl_size = 0
626570

627-
print(f"\033[4m Lvl: {'Level Name':<16}{'Est. Size':>9} - {'Description'}\033[0m")
571+
if statistics:
572+
print(f"\033[4m Lvl: {'Level Name':<16}{'This Level':^25} {'Cumulative':^24}\033[0m")
573+
else:
574+
print(f"\033[4m Lvl: {'Level Name':<16}{'Est. Size':>9} - {'Description'}\033[0m")
628575
for lvl, bl in BACKUP_LEVELS.items():
629576
if bl.finder is None:
630-
print(f"\033[1m {lvl:3d}: {'^'+bl.name+'^':<16}{"\"" if bl.level else "0 B":^9} - {bl.description}\033[0m")
577+
print(f"\033[1m {lvl:3d}: {'^'+bl.name+'^':<16}{"\"" if (bl.level or statistics) else "0 B":^9} - {bl.description if not statistics else ''}\033[0m")
631578
else:
632579
files = bl.finder()
633580
this_level_inc_size = 0
634581
this_level_overlap_size = 0
582+
this_level_overlap_dl_size = 0
583+
this_level_inc_dl_size = 0
635584
while files:
636585
file = files.pop()
586+
target_path = gdrive.gcache.get_cache_path_for_file(file)
587+
if not target_path:
588+
continue
637589
if file['id'] in seen_file_ids:
638590
this_level_overlap_size += file['size']
591+
if statistics and target_path.exists():
592+
this_level_overlap_dl_size += file['size']
639593
else:
640594
this_level_inc_size += file['size']
641595
seen_file_ids.add(file['id'])
596+
if statistics and target_path.exists():
597+
this_level_inc_dl_size += file['size']
642598
cum_sum_size += this_level_inc_size
643-
print(f" {lvl:3d}: {bl.name:<16}{format_size(cum_sum_size):>9} - {bl.description}")
599+
cum_sum_dl_size += this_level_inc_dl_size
600+
if statistics:
601+
if this_level_inc_size+this_level_overlap_size > 0:
602+
description = f"{format_cache_percentage(this_level_inc_dl_size+this_level_overlap_dl_size, this_level_inc_size+this_level_overlap_size):<25} {format_cache_percentage(cum_sum_dl_size, cum_sum_size):^24}"
603+
else:
604+
description = f" [N/A]"
605+
else:
606+
description = f"{format_size(cum_sum_size):>9} - {bl.description}"
607+
print(f" {lvl:3d}: {bl.name:<16}{description}")
644608

645609
if __name__ == "__main__":
646610
import argparse
@@ -657,6 +621,14 @@ def backup_level(value):
657621
raise argparse.ArgumentTypeError(f"{ivalue} is too large to be a valid backup level. Use --list-levels to see valid levels and their meaning.")
658622
return ivalue
659623

624+
levels = subparsers.add_parser("levels", help="Print information about the backup levels")
625+
levels.add_argument(
626+
'--stats',
627+
action="store_true",
628+
default=False,
629+
help="Instead of the description, print stats about the cache at each level"
630+
)
631+
660632
backup = subparsers.add_parser("backup", help="Download files from Drive to the cache")
661633
backup.add_argument(
662634
"--to-level", '-l',
@@ -674,11 +646,6 @@ def backup_level(value):
674646
type=int,
675647
help="Skip levels less than this (this run only)",
676648
)
677-
backup.add_argument(
678-
"--list-levels",
679-
action="store_true",
680-
help="List all available backup levels and exit",
681-
)
682649
backup.add_argument(
683650
"--threads", "-t",
684651
required=False,
@@ -723,20 +690,22 @@ def backup_level(value):
723690

724691
if args.command == "sideload":
725692
sideload_main(args.files, args.parent_folder, move=(not args.copy), recurse=args.recursive, check=args.replace)
693+
sys.exit(0)
694+
695+
with yaspin(text="Loading website data..."):
696+
website.load()
697+
website.data.linked_ids = set()
698+
for item in website.content:
699+
if not item.get('external_url') and not item.get('file_links'):
700+
continue
701+
for drive_link in item.get('drive_links', []):
702+
website.data.linked_ids.add(
703+
gdrive.link_to_id(drive_link)
704+
)
705+
706+
if args.command == "levels":
707+
print_backup_levels_list(statistics=args.stats)
726708
elif args.command == "backup":
727-
with yaspin(text="Loading website data..."):
728-
website.load()
729-
website.data.linked_ids = set()
730-
for item in website.content:
731-
if not item.get('external_url') and not item.get('file_links'):
732-
continue
733-
for drive_link in item.get('drive_links', []):
734-
website.data.linked_ids.add(
735-
gdrive.link_to_id(drive_link)
736-
)
737-
if args.list_levels:
738-
print_backup_levels_list()
739-
else:
740-
backup_main(from_level=args.from_level, new_max_level=args.level, parallelism=args.threads)
709+
backup_main(from_level=args.from_level, new_max_level=args.level, parallelism=args.threads)
741710
else:
742711
parser.print_help()

‎scripts/local_gdrive.py‎

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from time import sleep
88
from mimetypes import guess_extension
99

10+
import googleapiclient.errors as gerrors
1011
import gdrive_base
1112

1213
from yaspin import yaspin
@@ -19,6 +20,7 @@
1920
from strutils import (
2021
input_with_prefill,
2122
prompt,
23+
md5,
2224
)
2325

2426
def UTC_NOW():
@@ -691,6 +693,81 @@ def find_duplicate_md5s(self) -> List[str]:
691693
"""
692694
self.cursor.execute(sql)
693695
return [row['md5_checksum'] for row in self.cursor.fetchall()]
696+
697+
def get_cache_path_for_file(self, file: dict) -> Path | None:
698+
"""Tells where the file would be. It may or may not be there"""
699+
if not isinstance(self.file_cache_dir, Path):
700+
return None
701+
702+
if 'trashed_time' in file:
703+
rm_date = file['trashed_time'] or file['modifiedTime']
704+
rm_date = datetime.fromisoformat(rm_date)
705+
return self.file_cache_dir / 'trash' / str(rm_date.year) / f"{rm_date.month:02d}" / file['name']
706+
707+
is_gdoc = file['mimeType'] == 'application/vnd.google-apps.document'
708+
is_gsheet = file['mimeType'] == 'application/vnd.google-apps.spreadsheet'
709+
if is_gdoc or is_gsheet:
710+
hashval = md5(file['id'] + str(file['version']))
711+
else:
712+
hashval = file.get('md5Checksum')
713+
if not isinstance(hashval, str):
714+
return None
715+
assert len(hashval) == 32
716+
717+
if is_gdoc:
718+
extension = '.docx'
719+
elif is_gsheet:
720+
extension = '.xlsx'
721+
elif '.' in file.get('name', ''):
722+
extension = '.' + str(file['name']).split('.')[-1].lower()
723+
else:
724+
extension = guess_extension(file['mimeType']) or ''
725+
726+
return self.file_cache_dir / hashval[:2] / f"{hashval[2:]}{extension}"
727+
728+
def download_file_to_cache(self, file: dict, verbose: bool=False) -> str | None:
    """Download `file` into the local cache, resolving shortcuts first.

    Returns the cached path as a string (NOTE: annotation fixed from
    `Path | None` — both return statements produce str), or None when the
    file cannot be cached: dangling shortcut, no usable content hash, or
    Google refuses to export it.
    """
    # Shortcuts are downloaded as their target file.
    if file['mimeType'] == 'application/vnd.google-apps.shortcut':
        tfile = self.get_item(file['shortcutDetails']['targetId'])
        if not tfile:
            print(f"WARNING: Skipping dangling shortcut \"{file['name']}\" in {file['parent_id']}")
            return None
        file = tfile
    target_path = self.get_cache_path_for_file(file)
    if not target_path:
        return None
    if target_path.exists():
        if verbose:
            print(f" Skipping already downloaded {file['name']}")
        return str(target_path)
    # parents=True: trash paths nest trash/<year>/<month> below the cache
    # root, so a single-level mkdir would raise FileNotFoundError.
    target_path.parent.mkdir(parents=True, exist_ok=True)
    if verbose:
        print(f" Downloading {file['name']}")
    if not file['mimeType'].startswith('application/vnd.google-apps'):
        try:
            gdrive_base.download_file(file['id'], target_path, verbose=verbose)
        except FileNotFoundError:
            if target_path.exists():
                # Another thread got this file before us 😅
                pass
            else:
                raise
    else:
        # Native Google formats must be exported, not downloaded directly.
        try:
            if file['mimeType'] == 'application/vnd.google-apps.document':
                gdrive_base.download_gdoc_as_docx(file['id'], target_path)
            elif file['mimeType'] == 'application/vnd.google-apps.spreadsheet':
                gdrive_base.download_gsheet_as_xlsx(file['id'], target_path)
        except gerrors.HttpError as e:
            if "exportSizeLimitExceeded" in str(e):
                if verbose:
                    print(f" Skipping {file['name']}: it's too large to be exported :(")
                return None
            if "cannot be exported" in str(e):
                return None
            raise
    if verbose:
        print(f" Saved to {target_path}")
    return str(target_path)
694771

695772
def get_cache_path_for_md5(self, hashval: str) -> Path | None:
696773
"""Returns None if the hashval is unknown to me"""

‎scripts/strutils.py‎

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -484,9 +484,9 @@ def format_size(size_in_bytes):
484484
size_in_bytes /= 1024
485485
for unit in ['KB', 'MB', 'GB', 'TB']:
486486
if size_in_bytes < 1000:
487-
return f"{size_in_bytes:.2f} {unit}"
487+
return f"{size_in_bytes:.1f} {unit}"
488488
size_in_bytes /= 1024
489-
return f"{size_in_bytes:.2f} PB"
489+
return f"{size_in_bytes:.1f} PB"
490490

491491
def write_frontmatter_key(path: Path, key: str, value, insert_after_key=None):
492492
"""Takes a markdown file and top-level frontmatter key and sets it to value

0 commit comments

Comments
 (0)