@@ -1,5 +1,6 @@
 #!/bin/python3
 
+from enum import StrEnum
 from collections.abc import Collection
 from collections import OrderedDict, deque
 import random
@@ -12,6 +13,7 @@
 import shutil
 from strutils import (
     md5,
+    format_size,
 )
 from executils import graceful_threadmap
 from yaspin import yaspin
@@ -71,7 +73,87 @@ def decorator(func: Callable[[], list[dict]]):
 
 add_backup_level(0, "Cache Only", "Don't proactively fill the cache at all")
 
-# @backup_level(1, "lone published", "Non-video files included but without a website source")
+class TagFolderTypes(StrEnum):
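+    """Categories of Drive folders that can be requested when collecting a tag's content."""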
+    PUBLIC = 'public'
+    PRIVATE = 'private'
+    NONTAG_PUBLIC_SUBS = 'public/*'
+    UNREAD = 'private/unread'
+    ARCHIVE = 'private/archive'
+
+def get_folders_of_types_for_tag(tag: website.TagFile, include_folders: Collection[TagFolderTypes]) -> list[dict]:
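+    """Resolve the Drive folders of the requested types for one tag, returning their cached folder records."""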
+    folder_json = gdrive.FOLDERS_DATA()
+    if tag.slug not in folder_json:
+        return []
+    private_link = folder_json[tag.slug]['private']
+    public_link = folder_json[tag.slug]['public']
+    public_id = gdrive.folderlink_to_id(public_link) if public_link else None
+    private_id = gdrive.folderlink_to_id(private_link) if private_link else None
+    ret = []
+    if private_id:
+        private_subfolders = gdrive.gcache.get_subfolders(
+            parent_id=private_id,
+            include_shortcuts=False,
+        )
+    else:
+        private_subfolders = []
+    for inc_type in set(include_folders):
+        match inc_type:
+            case TagFolderTypes.PUBLIC:
+                if public_id:
+                    ret.append(gdrive.gcache.get_item(public_id))
+            case TagFolderTypes.PRIVATE:
+                if private_id:
+                    ret.append(gdrive.gcache.get_item(private_id))
+            case TagFolderTypes.NONTAG_PUBLIC_SUBS:
+                if public_id:
+                    folderids_to_tag = gdrive.load_folder_slugs()
+                    for sf in gdrive.gcache.get_subfolders(parent_id=public_id, include_shortcuts=False):
+                        if sf['id'] not in folderids_to_tag:
+                            ret.append(sf)
+            case TagFolderTypes.UNREAD:
+                for sf in private_subfolders:
+                    if 'unread' in sf['name'].lower():
+                        ret.append(sf)
+            case TagFolderTypes.ARCHIVE:
+                for sf in private_subfolders:
+                    if sf['name'].lower().startswith('archive'):
+                        ret.append(sf)
+    return ret
+
+@backup_level(1, "lone published", "Non-av files in tags without a source")
+def find_unlinked_tag_content(
+    include_folders: set[TagFolderTypes] = {
+        TagFolderTypes.PUBLIC,
+        TagFolderTypes.NONTAG_PUBLIC_SUBS,
+        TagFolderTypes.PRIVATE,
+    },
+    include_av=False,
+) -> list[dict]:
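+    """List files in the selected tag folders, skipping Google-native items and (unless include_av) audio/video/zip files."""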
+    ret = []
+    for tag in website.tags:
+        folders = get_folders_of_types_for_tag(tag, include_folders)
+        if not folders:
+            continue
+        nqs = ['?'] * len(folders)
+        nqs = ', '.join(nqs)
+        bad_mime_prefixes = ['application/vnd.google-apps']
+        if not include_av:
+            bad_mime_prefixes.extend([
+                'audio/',
+                'video/',
+                'application/zip',
+            ])
+        bad_mime_prefixes = [
+            f"mime_type LIKE '{prefix}%'"
+            for prefix in bad_mime_prefixes
+        ]
+        all_files = gdrive.gcache.sql_query(
+            f"parent_id IN ({nqs}) AND NOT ({' OR '.join(bad_mime_prefixes)})",
+            tuple(sf['id'] for sf in folders),
+        )
+        random.shuffle(all_files)
+        ret.extend(all_files)
+    return ret
 
 add_backup_level(10, "High", "All valuable items in need of backing up")
 add_backup_level(30, "Medium", "All items in active need of backing up")
@@ -92,17 +174,17 @@ def find_eks_files() -> list[str]:
 def find_academia_edu_pdfs() -> list[dict]:
     return query_parent_name(BULK_PDF_FOLDER_NAMES[BulkPDFType.ACADEMIA_EDU])
 
-add_backup_level(60, "Low", "All valuable items, including those backed up elsewhere")
+add_backup_level(60, "Low", "All valuable items, even if backed up elsewhere")
 
-@backup_level(66, 'core api pdfs', "The PDFs pulled from CORE yet unsorted")
+@backup_level(66, 'core api', "The PDFs pulled from CORE yet unsorted")
 def find_unsorted_core_pdfs() -> list[dict]:
     return query_parent_name(BULK_PDF_FOLDER_NAMES[BulkPDFType.CORE_API])
 
-@backup_level(72, 'rejects', "Saves files actively rejected from the library")
+@backup_level(72, 'rejects', "actively rejected from the library")
 def find_rejected_files() -> list[dict]:
     return query_my_cache("parent_id = ?", (gdrive.REJECTS_FOLDER_ID,))
 
-@backup_level(78, "all OBU files", "Attempts to save every descendant of the library roots")
+@backup_level(78, "all obu files", "every descendant of the library roots")
 def find_all_obu_files(filter_fn: Callable[[dict], bool] = None) -> list[dict]:
     ret = []
     folders_data = gdrive.FOLDERS_DATA()
@@ -148,6 +230,7 @@ def find_all_obu_pdfs() -> list[dict]:
 def list_one_off_docs() -> list[dict]:
     return [
         gdrive.gcache.get_item(fid) for fid in [
+            '1TN6KzqD7-dEwcEJ9cs9qykuUHT_QRMpm7TVD8cT_biU',
             '1NNlHLr928Mb-NRiJKjZxdwTrsYY7cSZdR_3KulvjiYA',
             '1Yi6evYG0NsdYzVBO7o8XrCIHo3dApJ4DmlXraerzZFw',
         ]
@@ -181,7 +264,7 @@ def find_all_obu_text_docs() -> list[dict]:
 def find_all_obu_audio_files() -> list[dict]:
     return find_all_obu_files(lambda f: f['mimeType'].startswith('audio'))
 
-@backup_level(84, "google docs and sheets", "My manually created docs")
+@backup_level(84, "google docs", "My manually created Docs and Sheets")
 def find_manual_docs() -> list[dict]:
     with gdrive.gcache._lock:
         ret = gdrive.gcache.cursor.execute(
@@ -202,21 +285,21 @@ def find_manual_docs() -> list[dict]:
     random.shuffle(ret)
     return ret
 
-@backup_level(90, "youtube metadata", "JSON Files pulled from the YouTube API")
+@backup_level(90, "youtube", "JSON Files pulled from the YouTube API")
 def find_youtube_metadata() -> list[dict]:
     return query_my_cache("parent_id = ?", (gdrive.YOUTUBE_METADATA_FOLDER_ID,))
 
 add_backup_level(100, "Comprehensive", "All reasonable items")
 
-@backup_level(106, 'old versions', 'download the old versions slated for deletion anyway')
+@backup_level(106, 'old versions', 'slated for eventual deletion anyway')
 def find_old_versions() -> list[dict]:
     return query_my_cache("parent_id = ?", (gdrive.OLD_VERSIONS_FOLDER_ID,))
 
-@backup_level(112, "google docs", "All Google Docs, including the autogenerated ones")
+@backup_level(112, "google docs", "All GDocs, including the autogenerated ones")
 def find_all_gdocs() -> list[dict]:
     return query_my_cache("mime_type = ?", ('application/vnd.google-apps.document',))
 
-@backup_level(118, "pkl files", "All files ending in .pkl (usually these are cached in NORMALIZED_TEXT_FOLDER)")
+@backup_level(118, "pkl files", "All files ending in .pkl (e.g. NORMALIZED_TEXT_FOLDER)")
 def find_all_pkl_files() -> list[dict]:
     return query_my_cache("name LIKE '%.pkl'")
 
@@ -231,7 +314,11 @@ def find_all_shared_files() -> list[dict]:
 def download_file_to_cache(file: dict, verbose=True) -> str | None:
     """Will try its best, following shortcuts, exporting docs, etc."""
     if file['mimeType'] == 'application/vnd.google-apps.shortcut':
-        file = gdrive.gcache.get_item(file['shortcutDetails']['targetId'])
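+        # The shortcut's target may be missing from the cache; skip it rather than fail.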
+        tfile = gdrive.gcache.get_item(file['shortcutDetails']['targetId'])
+        if not tfile:
+            print(f"WARNING: Skipping dangling shortcut \"{file['name']}\" in {file['parent_id']}")
+            return None
+        file = tfile
 
     is_gdoc = file['mimeType'] == 'application/vnd.google-apps.document'
     is_gsheet = file['mimeType'] == 'application/vnd.google-apps.spreadsheet'
@@ -256,7 +343,8 @@ def download_file_to_cache(file: dict, verbose=True) -> str | None:
     assert isinstance(cache_dir, Path)
     target_path = cache_dir / hashval[:2] / f"{hashval[2:]}{extension}"
     if target_path.exists():
-        print(f" Skipping already downloaded {file['name']}")
+        if verbose:
+            print(f" Skipping already downloaded {file['name']}")
         return str(target_path)
     target_path.parent.mkdir(exist_ok=True)
     if verbose:
@@ -358,7 +446,7 @@ def sideload_main(
             for child in file.iterdir():
                 files.append(child)
     files = [f for f in files if f not in to_remove]
-    if len(files) > 100:
+    if len(files) > 1:
         file_iter = tqdm(files)
     else:
         file_iter = iter(files)
@@ -406,6 +494,38 @@ def backup_main(new_max_level: int | None=None, parallelism: int=0):
         run_backup_level(level, parallelism=parallelism)
     print(f"All files with priority <= {max_level} are now saved locally!")
 
+def print_backup_levels_list():
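+    """Print every backup level with its name, cumulative estimated size, and description."""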
+    print("\033[1mGoogle Drive Backup Levels\033[0m")
+    print(
+        """
+Backing up to a level 'n' includes all levels < n, so each level includes the levels above it in the chart below.
+
+The bold levels are semantic breakpoints which add no content themselves. You're encouraged to pick one of these levels, but you're welcome to pick any integer between 0 and 127. If new levels are added in the future, they'll be added between the existing levels at the appropriate priority.
+
+The current backup levels are as follows:
+"""
+    )
+    seen_file_ids = set()
+    cum_sum_size = 0
+
+    print(f"\033[4m Lvl: {'Level Name':<16} {'Est. Size':>9} - {'Description'}\033[0m")
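+    # Each file is counted toward the first level that includes it, so Est. Size is cumulative down the chart.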
+    for lvl, bl in BACKUP_LEVELS.items():
+        if bl.finder is None:
+            print(f"\033[1m {lvl:3d}: {'^' + bl.name + '^':<16} {'"' if bl.level else '0 B':^9} - {bl.description}\033[0m")
+        else:
+            files = bl.finder()
+            this_level_inc_size = 0
+            this_level_overlap_size = 0
+            while files:
+                file = files.pop()
+                if file['id'] in seen_file_ids:
+                    this_level_overlap_size += file['size']
+                else:
+                    this_level_inc_size += file['size']
+                    seen_file_ids.add(file['id'])
+            cum_sum_size += this_level_inc_size
+            print(f" {lvl:3d}: {bl.name:<16} {format_size(cum_sum_size):>9} - {bl.description}")
+
 if __name__ == "__main__":
     import argparse
     parser = argparse.ArgumentParser(
@@ -480,13 +600,18 @@ def backup_level(value):
     if args.command == "sideload":
         sideload_main(args.files, args.parent_folder, move=(not args.copy), recurse=args.recursive, check=args.replace)
     elif args.command == "backup":
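+        # Collect the Drive file IDs that website content items already link to.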
+        with yaspin(text="Loading website data..."):
+            website.load()
+        website.data.linked_ids = set()
+        for item in website.content:
+            if not item.get('external_url') and not item.get('file_links'):
+                continue
+            for drive_link in item.get('drive_links', []):
+                website.data.linked_ids.add(
+                    gdrive.link_to_id(drive_link)
+                )
         if args.list_levels:
-            print("Available Backup Levels:")
-            for lvl, bl in BACKUP_LEVELS.items():
-                if bl.finder is None:
-                    print(f"\033[1m {lvl:3d}: {bl.name:<15} - {bl.description}\033[0m")
-                else:
-                    print(f" {lvl:3d}: {bl.name:<15} - {bl.description}")
+            print_backup_levels_list()
         else:
             backup_main(new_max_level=args.level, parallelism=args.threads)
     else: