Skip to content

Commit 134b8df

Browse files
committed
Add backup level 1
[skip ci]
1 parent 82b8b7b commit 134b8df

3 files changed

Lines changed: 147 additions & 21 deletions

File tree

‎scripts/gdrive_cache.py‎

Lines changed: 144 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#!/bin/python3
22

3+
from enum import StrEnum
34
from collections.abc import Collection
45
from collections import OrderedDict, deque
56
import random
@@ -12,6 +13,7 @@
1213
import shutil
1314
from strutils import (
1415
md5,
16+
format_size,
1517
)
1618
from executils import graceful_threadmap
1719
from yaspin import yaspin
@@ -71,7 +73,87 @@ def decorator(func: Callable[[], list[dict]]):
7173

7274
add_backup_level(0, "Cache Only", "Don't proactively fill the cache at all")
7375

74-
# @backup_level(1, "lone published", "Non-video files included but without a website source")
76+
class TagFolderTypes(StrEnum):
    """Categories of Google Drive folders associated with a website tag.

    Members are consumed by get_folders_of_types_for_tag() to decide which
    of a tag's folders (and which of their subfolders) to collect.
    """
    PUBLIC = 'public'                 # the tag's public top-level folder
    PRIVATE = 'private'               # the tag's private top-level folder
    NONTAG_PUBLIC_SUBS = 'public/*'   # public subfolders that are not themselves tag folders
    UNREAD = 'private/unread'         # private subfolders with 'unread' in the name
    ARCHIVE = 'private/archive'       # private subfolders whose name starts with 'archive'
82+
83+
def get_folders_of_types_for_tag(tag: website.TagFile, include_folders: Collection[TagFolderTypes]) -> list[dict]:
    """Resolve the Drive folders of the requested categories for one tag.

    Looks up the tag's public/private folder links in FOLDERS_DATA and
    returns the cached folder records matching each requested
    ``TagFolderTypes`` member. A tag with no folder entry yields ``[]``.
    Result order is unspecified: ``include_folders`` is deduplicated
    through a set before iteration.
    """
    all_links = gdrive.FOLDERS_DATA()
    if tag.slug not in all_links:
        return []
    links = all_links[tag.slug]
    public_id = gdrive.folderlink_to_id(links['public']) if links['public'] else None
    private_id = gdrive.folderlink_to_id(links['private']) if links['private'] else None

    # Private subfolders are needed by both UNREAD and ARCHIVE, so fetch once.
    private_subfolders = []
    if private_id:
        private_subfolders = gdrive.gcache.get_subfolders(
            parent_id=private_id,
            include_shortcuts=False,
        )

    found = []
    for folder_type in set(include_folders):
        if folder_type == TagFolderTypes.PUBLIC:
            if public_id:
                found.append(gdrive.gcache.get_item(public_id))
        elif folder_type == TagFolderTypes.PRIVATE:
            if private_id:
                found.append(gdrive.gcache.get_item(private_id))
        elif folder_type == TagFolderTypes.NONTAG_PUBLIC_SUBS:
            if public_id:
                # Public subfolders that are not themselves tag folders.
                tagged_folder_ids = gdrive.load_folder_slugs()
                found.extend(
                    sub for sub in gdrive.gcache.get_subfolders(parent_id=public_id, include_shortcuts=False)
                    if sub['id'] not in tagged_folder_ids
                )
        elif folder_type == TagFolderTypes.UNREAD:
            found.extend(sub for sub in private_subfolders if 'unread' in sub['name'].lower())
        elif folder_type == TagFolderTypes.ARCHIVE:
            found.extend(sub for sub in private_subfolders if sub['name'].lower().startswith('archive'))
    return found
122+
123+
@backup_level(1, "lone published", "Non-av files in tags without a source")
def find_unlinked_tag_content(
    include_folders: Collection[TagFolderTypes]=frozenset({
        TagFolderTypes.PUBLIC,
        TagFolderTypes.NONTAG_PUBLIC_SUBS,
        TagFolderTypes.PRIVATE,
    }),
    include_av: bool=False,
) -> list[dict]:
    """Find non-audio/video files living directly in tag folders.

    For every website tag, gathers the requested folder categories (see
    ``TagFolderTypes``) and queries the local Drive cache for their direct
    children, always excluding Google-native documents and — unless
    ``include_av`` is set — audio, video and zip files.

    Args:
        include_folders: which folder categories of each tag to scan.
            The default is a ``frozenset`` on purpose: a mutable ``set``
            default would be one shared object across all calls.
        include_av: when True, audio/video/zip files are included too.

    Returns:
        A list of Drive file dicts, shuffled within each tag's batch.
    """
    ret = []
    for tag in website.tags:
        folders = get_folders_of_types_for_tag(tag, include_folders)
        if not folders:
            continue
        # One SQL placeholder per folder id.
        placeholders = ', '.join(['?'] * len(folders))
        # Mime-type prefixes to exclude; Google-native files can't be
        # downloaded as-is, so they are always filtered out here.
        bad_mime_prefixes = ['application/vnd.google-apps']
        if not include_av:
            bad_mime_prefixes.extend([
                'audio/',
                'video/',
                'application/zip',
            ])
        # NOTE: the prefixes are trusted constants from this module, so
        # interpolating them into LIKE clauses is safe; the folder ids go
        # through proper '?' parameters.
        exclusion_clauses = [
            f"mime_type LIKE '{prefix}%'"
            for prefix in bad_mime_prefixes
        ]
        all_files = gdrive.gcache.sql_query(
            f"parent_id IN ({placeholders}) AND NOT ({' OR '.join(exclusion_clauses)})",
            tuple(sf['id'] for sf in folders),
        )
        random.shuffle(all_files)
        ret.extend(all_files)
    return ret
75157

76158
add_backup_level(10, "High", "All valuable items in need of backing up")
77159
add_backup_level(30, "Medium", "All items in active need of backing up")
@@ -92,17 +174,17 @@ def find_eks_files() -> list[str]:
92174
def find_academia_edu_pdfs() -> list[dict]:
    """List every cached file under the academia.edu bulk-PDF folder."""
    folder_name = BULK_PDF_FOLDER_NAMES[BulkPDFType.ACADEMIA_EDU]
    return query_parent_name(folder_name)
94176

95-
add_backup_level(60, "Low", "All valuable items, including those backed up elsewhere")
177+
add_backup_level(60, "Low", "All valuable items, even backed up elsewhere")
96178

97-
@backup_level(66, 'core api pdfs', "The PDFs pulled from CORE yet unsorted")
179+
@backup_level(66, 'core api', "The PDFs pulled from CORE yet unsorted")
def find_unsorted_core_pdfs() -> list[dict]:
    """List the not-yet-sorted PDFs fetched from the CORE API."""
    core_folder = BULK_PDF_FOLDER_NAMES[BulkPDFType.CORE_API]
    return query_parent_name(core_folder)
100182

101-
@backup_level(72, 'rejects', "Saves files actively rejected from the library")
183+
@backup_level(72, 'rejects', "actively rejected from the library")
def find_rejected_files() -> list[dict]:
    """Return cached records for the direct children of the rejects folder."""
    rejects_parent = (gdrive.REJECTS_FOLDER_ID,)
    return query_my_cache("parent_id = ?", rejects_parent)
104186

105-
@backup_level(78, "all OBU files", "Attempts to save every descendant of the library roots")
187+
@backup_level(78, "all obu files", "every descendant of the library roots")
106188
def find_all_obu_files(filter_fn: Callable[[dict], bool]=None) -> list[dict]:
107189
ret = []
108190
folders_data = gdrive.FOLDERS_DATA()
@@ -148,6 +230,7 @@ def find_all_obu_pdfs() -> list[dict]:
148230
def list_one_off_docs() -> list[dict]:
149231
return [
150232
gdrive.gcache.get_item(fid) for fid in [
233+
'1TN6KzqD7-dEwcEJ9cs9qykuUHT_QRMpm7TVD8cT_biU',
151234
'1NNlHLr928Mb-NRiJKjZxdwTrsYY7cSZdR_3KulvjiYA',
152235
'1Yi6evYG0NsdYzVBO7o8XrCIHo3dApJ4DmlXraerzZFw',
153236
]
@@ -181,7 +264,7 @@ def find_all_obu_text_docs() -> list[dict]:
181264
def find_all_obu_audio_files() -> list[dict]:
    """Every OBU library file whose mime type marks it as audio."""
    def _is_audio(f: dict) -> bool:
        # Matches e.g. 'audio/mpeg', 'audio/ogg', ...
        return f['mimeType'].startswith('audio')
    return find_all_obu_files(_is_audio)
183266

184-
@backup_level(84, "google docs and sheets", "My manually created docs")
267+
@backup_level(84, "google docs", "My manually created Docs and Sheets")
185268
def find_manual_docs() -> list[dict]:
186269
with gdrive.gcache._lock:
187270
ret = gdrive.gcache.cursor.execute(
@@ -202,21 +285,21 @@ def find_manual_docs() -> list[dict]:
202285
random.shuffle(ret)
203286
return ret
204287

205-
@backup_level(90, "youtube metadata", "JSON Files pulled from the YouTube API")
288+
@backup_level(90, "youtube", "JSON Files pulled from the YouTube API")
def find_youtube_metadata() -> list[dict]:
    """All cached children of the YouTube-metadata folder."""
    where_clause = "parent_id = ?"
    return query_my_cache(where_clause, (gdrive.YOUTUBE_METADATA_FOLDER_ID,))
208291

209292
add_backup_level(100, "Comprehensive", "All reasonable items")
210293

211-
@backup_level(106, 'old versions', 'download the old versions slated for deletion anyway')
294+
@backup_level(106, 'old versions', 'slated for eventual deletion anyway')
def find_old_versions() -> list[dict]:
    """Old file versions parked in the dedicated old-versions folder."""
    parent_filter = (gdrive.OLD_VERSIONS_FOLDER_ID,)
    return query_my_cache("parent_id = ?", parent_filter)
214297

215-
@backup_level(112, "google docs", "All Google Docs, including the autogenerated ones")
298+
@backup_level(112, "google docs", "All GDocs, including the autogenerated ones")
def find_all_gdocs() -> list[dict]:
    """Every Google Doc in the cache, regardless of how it was created."""
    gdoc_mime = 'application/vnd.google-apps.document'
    return query_my_cache("mime_type = ?", (gdoc_mime,))
218301

219-
@backup_level(118, "pkl files", "All files ending in .pkl (usually these are cached in NORMALIZED_TEXT_FOLDER)")
302+
@backup_level(118, "pkl files", "All files ending in .pkl (e.g. NORMALIZED_TEXT_FOLDER)")
def find_all_pkl_files() -> list[dict]:
    """Every cached file whose name ends in .pkl."""
    pkl_filter = "name LIKE '%.pkl'"
    return query_my_cache(pkl_filter)
222305

@@ -231,7 +314,11 @@ def find_all_shared_files() -> list[dict]:
231314
def download_file_to_cache(file: dict, verbose=True) -> str | None:
232315
"""Will try its best, following shortcuts, exporting docs, etc."""
233316
if file['mimeType'] == 'application/vnd.google-apps.shortcut':
234-
file = gdrive.gcache.get_item(file['shortcutDetails']['targetId'])
317+
tfile = gdrive.gcache.get_item(file['shortcutDetails']['targetId'])
318+
if not tfile:
319+
print(f"WARNING: Skipping dangling shortcut \"{file['name']}\" in {file['parent_id']}")
320+
return None
321+
file = tfile
235322

236323
is_gdoc = file['mimeType'] == 'application/vnd.google-apps.document'
237324
is_gsheet = file['mimeType'] == 'application/vnd.google-apps.spreadsheet'
@@ -256,7 +343,8 @@ def download_file_to_cache(file: dict, verbose=True) -> str | None:
256343
assert isinstance(cache_dir, Path)
257344
target_path = cache_dir / hashval[:2] / f"{hashval[2:]}{extension}"
258345
if target_path.exists():
259-
print(f" Skipping already downloaded {file['name']}")
346+
if verbose:
347+
print(f" Skipping already downloaded {file['name']}")
260348
return str(target_path)
261349
target_path.parent.mkdir(exist_ok=True)
262350
if verbose:
@@ -358,7 +446,7 @@ def sideload_main(
358446
for child in file.iterdir():
359447
files.append(child)
360448
files = [f for f in files if f not in to_remove]
361-
if len(files) > 100:
449+
if len(files) > 1:
362450
file_iter = tqdm(files)
363451
else:
364452
file_iter = iter(files)
@@ -406,6 +494,38 @@ def backup_main(new_max_level: int | None=None, parallelism: int=0):
406494
run_backup_level(level, parallelism=parallelism)
407495
print(f"All files with priority <= {max_level} are now saved locally!")
408496

497+
def print_backup_levels_list():
    """Print a human-readable chart of all backup levels.

    Runs each level's finder to estimate the cumulative download size a
    backup up to that level would require. Breakpoint levels (those with
    no finder) are printed in bold, with a ditto mark for size.
    """
    print("\033[1mGoogle Drive Backup Levels\033[0m")
    print(
        """
Backing up to a level 'n' includes all levels < n, so each level includes the levels above it in the chart below.

The bold levels are semantic breakpoints which add no content themselves. You're encouraged to pick one of these levels but are welcome to pick any integer you like between 0 and 127. If new levels are added in the future, they'll be added in between the existing levels at the appropriate priority.

The current backup levels are as follows:
"""
    )
    seen_file_ids = set()
    cum_sum_size = 0

    print(f"\033[4m Lvl: {'Level Name':<16}{'Est. Size':>9} - {'Description'}\033[0m")
    for lvl, bl in BACKUP_LEVELS.items():
        if bl.finder is None:
            # Breakpoint level: adds no files of its own. Show a ditto
            # mark (size unchanged from the row above) except at level 0,
            # which genuinely holds 0 bytes. Hoisted into a local because
            # nesting same-type quotes inside an f-string is a syntax
            # error before Python 3.12 (PEP 701).
            size_str = '"' if bl.level else "0 B"
            print(f"\033[1m {lvl:3d}: {'^'+bl.name+'^':<16}{size_str:^9} - {bl.description}\033[0m")
        else:
            files = bl.finder()
            this_level_inc_size = 0
            # Tallied but not yet displayed — kept for a future overlap
            # column. TODO(review): surface it or drop it.
            this_level_overlap_size = 0
            while files:
                file = files.pop()
                # NOTE(review): assumes file['size'] is numeric; the Drive
                # API can return sizes as strings — confirm the cache
                # normalizes them before summing.
                if file['id'] in seen_file_ids:
                    this_level_overlap_size += file['size']
                else:
                    this_level_inc_size += file['size']
                    seen_file_ids.add(file['id'])
            cum_sum_size += this_level_inc_size
            print(f" {lvl:3d}: {bl.name:<16}{format_size(cum_sum_size):>9} - {bl.description}")
528+
409529
if __name__ == "__main__":
410530
import argparse
411531
parser = argparse.ArgumentParser(
@@ -480,13 +600,18 @@ def backup_level(value):
480600
if args.command == "sideload":
481601
sideload_main(args.files, args.parent_folder, move=(not args.copy), recurse=args.recursive, check=args.replace)
482602
elif args.command == "backup":
603+
with yaspin(text="Loading website data..."):
604+
website.load()
605+
website.data.linked_ids = set()
606+
for item in website.content:
607+
if not item.get('external_url') and not item.get('file_links'):
608+
continue
609+
for drive_link in item.get('drive_links', []):
610+
website.data.linked_ids.add(
611+
gdrive.link_to_id(drive_link)
612+
)
483613
if args.list_levels:
484-
print("Available Backup Levels:")
485-
for lvl, bl in BACKUP_LEVELS.items():
486-
if bl.finder is None:
487-
print(f"\033[1m {lvl:3d}: {bl.name:<15} - {bl.description}\033[0m")
488-
else:
489-
print(f" {lvl:3d}: {bl.name:<15} - {bl.description}")
614+
print_backup_levels_list()
490615
else:
491616
backup_main(new_max_level=args.level, parallelism=args.threads)
492617
else:

‎scripts/local_gdrive.py‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -613,7 +613,7 @@ def get_shortcuts_in_folder(self, parent_id: str) -> List[Dict[str, Any]]:
613613
)
614614

615615
@locked
616-
def get_subfolders(self, parent_id: str, include_shortcuts=True) -> List[Dict[str, Any]]:
616+
def get_subfolders(self: DriveCache, parent_id: str, include_shortcuts=True) -> List[Dict[str, Any]]:
617617
"""
618618
Returns immediate subfolders under parent_id
619619

‎scripts/website.py‎

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11

2+
from typing import Iterator
23
from typing import Any
34
import subprocess
45
import json
@@ -93,7 +94,7 @@ def sortChildren(self):
9394

9495
def get(self, tag: str):
9596
return self.tags.get(tag)
96-
def __iter__(self):
97+
def __iter__(self) -> Iterator[TagFile]:
    """Yield this collection's TagFile objects in the configured order."""
    ordered_filenames = config['collections']['tags']['order']
    for fname in ordered_filenames:
        # Filenames carry a 3-character extension (e.g. '.md'); the
        # tag key is the stem.
        yield self.tags[fname[:-3]]
99100
def __len__(self):

0 commit comments

Comments
 (0)