Skip to content

Commit aed035d

Browse files
committed
Some sideload improvements
[skip ci]
1 parent d1b5a14 commit aed035d

2 files changed

Lines changed: 121 additions & 45 deletions

File tree

‎scripts/gdrive_cache.py‎

Lines changed: 76 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,9 @@ def decorator(func: Callable[[], list[dict]]):
7070
return decorator
7171

7272
add_backup_level(0, "Cache Only", "Don't proactively fill the cache at all")
73+
74+
# @backup_level(1, "lone published", "Non-video files included but without a website source")
75+
7376
add_backup_level(10, "High", "All valuable items in need of backing up")
7477
add_backup_level(30, "Medium", "All items in active need of backing up")
7578

@@ -292,35 +295,50 @@ def run_backup_level(level: BackupLevel, parallelism=14):
292295
graceful_threadmap(download_file_to_cache, files, unit='f', max_workers=parallelism)
293296
print(f"Done backing up to level {level.level}!")
294297

295-
def sideload_file(file: Path, cache_dir: Path, parent_folder: str | None, move: bool):
298+
def sideload_file(file: Path, cache_dir: Path, parent_folder: str | None, move: bool, check: bool):
    """Move (or copy, if not `move`) a local `file` into its slot in the file cache.

    The destination is whatever `gdrive.gcache.get_cache_path_for_md5` returns
    for the file's MD5.  If the hash is unknown to `gdrive.gcache`, the file is
    first uploaded to `parent_folder`; without a `parent_folder` the file is
    skipped with a warning.

    Args:
        file: The local file to sideload.
        cache_dir: Must be an existing directory.  It is only sanity-checked
            here; the destination path itself comes from `gdrive.gcache`.
        parent_folder: Passed as `folder_id` when uploading an untracked file.
            When falsy, untracked files are skipped.
        move: When true the source file is removed once it is (or already
            was) in the cache; otherwise it is copied and left in place.
        check: When true, a file already present at the destination is
            re-hashed and replaced if it does not match.  When false, an
            existing destination file is trusted as-is.

    Raises:
        FileExistsError: For a trashed item, when both the remote-named path
            and the path named after `file`'s stem hold different content.
    """
    assert cache_dir.is_dir()
    hashval = md5(file)
    target_path = gdrive.gcache.get_cache_path_for_md5(hashval)
    if not target_path:
        if not parent_folder:
            print(f"WARNING: Skipping untracked file {file}")
            return
        gdrive.gcache.upload_file(file, folder_id=parent_folder)
        target_path = gdrive.gcache.get_cache_path_for_md5(hashval)
        assert target_path is not None
        # We just uploaded this very file, so the cache's idea of its extension
        # should agree with the local one (compared case-insensitively).
        assert target_path.suffix.lower() == file.suffix.lower(), f"How did we get a different extension {target_path.suffix} for {file}?"
    # Trashed items live under <cache>/trash/<year>/<month>/<name>
    is_in_trash = target_path.parent.parent.parent.name == 'trash'
    if is_in_trash:
        if target_path.exists() and md5(target_path) != hashval:
            # A different file already holds the remote name: fall back to the
            # local file's own stem inside the same trash folder.
            new_path = target_path.with_stem(file.stem)
            if new_path.exists() and md5(new_path) != hashval:
                raise FileExistsError(f"{new_path} also exists with a different file. Idk what to do now")
            target_path = new_path
        print(f"WARNING: File was trashed. Placing in {target_path}")
    if target_path.exists():
        if file.samefile(target_path):
            # The source *is* the cache entry: nothing to do, and unlinking it
            # below would delete the only copy.
            return
        if not check or md5(target_path) == hashval:
            if move:
                file.unlink()
            return
        print(f"Found corrupted: {target_path}")
        target_path.unlink()
    target_path.parent.mkdir(exist_ok=True, parents=is_in_trash)
    if move:
        # shutil.move renames when possible and falls back to copy+delete when
        # the source and the cache are on different filesystems.
        shutil.move(file, target_path)
    else:
        shutil.copy2(file, target_path)
321333

322334

323-
def sideload_main(files: Collection[Path], parent_folder: str | None = None, move: bool = True):
335+
def sideload_main(
336+
files: Collection[Path],
337+
parent_folder: str | None = None,
338+
move: bool = True,
339+
recurse: bool = False,
340+
check: bool = False,
341+
):
324342
if parent_folder:
325343
if parent_folder.startswith(gdrive.FOLDER_LINK_PREFIX):
326344
parent_folder = gdrive.folderlink_to_id(parent_folder)
@@ -329,17 +347,23 @@ def sideload_main(files: Collection[Path], parent_folder: str | None = None, mov
329347
raise ValueError(f"Folder with ID {parent_folder} not found")
330348
if folder['mimeType'] != 'application/vnd.google-apps.folder':
331349
raise ValueError(f"{parent_folder} is not a Google Drive Folder, but a {folder['mimeType']}")
350+
to_remove = set()
351+
for file in files:
352+
if not file.exists():
353+
raise FileNotFoundError(file)
354+
if file.is_dir():
355+
if not recurse:
356+
raise ValueError(f"{file} is a directory! Please specify files or use -r")
357+
to_remove.add(file)
358+
for child in file.iterdir():
359+
files.append(child)
360+
files = [f for f in files if f not in to_remove]
332361
if len(files) > 100:
333362
file_iter = tqdm(files)
334363
else:
335364
file_iter = iter(files)
336365
for file in file_iter:
337-
if not file.exists():
338-
print(f"WARNING: {file} does not exist!")
339-
continue
340-
if file.is_dir():
341-
raise ValueError(f"{file} is a directory! Please only specify specific files")
342-
sideload_file(file, gdrive.gcache.file_cache_dir, parent_folder, move)
366+
sideload_file(file, gdrive.gcache.file_cache_dir, parent_folder, move, check)
343367

344368
def get_saved_backup_level() -> int | None:
345369
with gdrive.gcache._lock:
@@ -358,6 +382,29 @@ def save_backup_level(level: int):
358382
)
359383
gdrive.gcache.conn.commit()
360384

385+
def backup_main(new_max_level: int | None=None, parallelism: int=0):
    """Run every backup level up to the requested (or previously saved) maximum.

    Args:
        new_max_level: The level to back up to.  When given it is persisted
            via `save_backup_level`; when None the value from
            `get_saved_backup_level` is used instead.
        parallelism: Worker count handed to `run_backup_level`.  Values below
            1 select the default of 14.

    Exits the process with status 1 when no level is supplied and none was
    saved, and with status 0 when the effective level is 0.
    """
    import sys
    if new_max_level is not None:
        save_backup_level(new_max_level)
        max_level = new_max_level
    else:
        max_level = get_saved_backup_level()
        if max_level is None:
            print("ERROR: No backup level supplied and no previous level found in the database. Please provide a --level.", file=sys.stderr)
            sys.exit(1)
    if max_level == 0:
        print('The cache is set to "cache only" mode. Nothing further to do.')
        sys.exit(0)
    # Resolve the worker count once rather than on every loop iteration.
    if parallelism < 1:
        parallelism = 14
    print(f"Will now back up GDrive to a level {max_level}")
    for level in BACKUP_LEVELS.values():
        if not level.finder:
            # Levels without a finder have nothing to download.
            continue
        if level.level > max_level:
            # NOTE(review): stopping here assumes BACKUP_LEVELS iterates in
            # ascending level order -- confirm against where it is populated.
            break
        run_backup_level(level, parallelism=parallelism)
    print(f"All files with priority <= {max_level} are now saved locally!")
361408

362409
if __name__ == "__main__":
363410
import argparse
@@ -412,14 +459,26 @@ def backup_level(value):
412459
help="Copy files in (default: move)",
413460
default=False,
414461
)
462+
sideload.add_argument(
463+
"--recursive", "-r",
464+
action="store_true",
465+
help="Allow sideload to crawl directories",
466+
default=False,
467+
)
468+
sideload.add_argument(
469+
'--replace', '-f',
470+
action="store_true",
471+
default=False,
472+
help="Don't assume the existing cache files are good",
473+
)
415474

416475
args = parser.parse_args()
417476

418477
if not gdrive.gcache.file_cache_dir:
419478
gdrive.gcache.set_file_cache_dir()
420479

421480
if args.command == "sideload":
422-
sideload_main(args.files, args.parent_folder, move=(not args.copy))
481+
sideload_main(args.files, args.parent_folder, move=(not args.copy), recurse=args.recursive, check=args.replace)
423482
elif args.command == "backup":
424483
if args.list_levels:
425484
print("Available Backup Levels:")
@@ -429,27 +488,6 @@ def backup_level(value):
429488
else:
430489
print(f" {lvl:3d}: {bl.name:<15} - {bl.description}")
431490
else:
432-
import sys
433-
if args.level is not None:
434-
save_backup_level(args.level)
435-
else:
436-
args.level = get_saved_backup_level()
437-
if args.level is None:
438-
print("ERROR: No backup level supplied and no previous level found in the database. Please provide a --level.", file=sys.stderr)
439-
sys.exit(1)
440-
if args.level == 0:
441-
print('The cache is set to "cach only" mode. Nothing further to do.')
442-
sys.exit(0)
443-
print(f"Will now back up GDrive to a level {args.level}")
444-
for level in BACKUP_LEVELS.values():
445-
if not level.finder:
446-
continue
447-
if level.level > args.level:
448-
break
449-
parallelism = args.threads
450-
if parallelism < 1:
451-
parallelism = 14
452-
run_backup_level(level, parallelism=parallelism)
453-
print(f"All files with priority <= {args.level} are now saved locally!")
491+
backup_main(new_max_level=args.level, parallelism=args.threads)
454492
else:
455493
parser.print_help()

‎scripts/local_gdrive.py‎

Lines changed: 45 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from pathlib import Path
66
from typing import List, Dict, Any, Optional, Callable, TypedDict, TypeVar, ParamSpec, Concatenate
77
from time import sleep
8+
from mimetypes import guess_extension
89

910
import gdrive_base
1011

@@ -171,6 +172,12 @@ def _create_table(self):
171172
self.cursor.execute(create_trashed_items_sql)
172173
self.cursor.execute(create_properties_table_sql)
173174
self.cursor.executescript(create_index_sql)
175+
176+
try:
177+
self.cursor.execute(f"ALTER TABLE trashed_drive_items ADD COLUMN trashed_time TEXT")
178+
except sqlite3.OperationalError:
179+
pass # Already exists
180+
174181
self.conn.commit()
175182

176183
def _prompt_for_file_cache_dir(self) -> Path:
@@ -373,7 +380,7 @@ def update(self):
373380
with yaspin(text="Pulling latest data from GDrive...") as ys:
374381
changes_page = changes_page['value']
375382
file_ids_to_fetch = set()
376-
file_ids_removed = set()
383+
file_ids_removed = {} # map id -> time
377384
while True:
378385
changelist = gdrive_base.session().changes().list(includeRemoved=True, restrictToMyDrive=False, pageToken=changes_page, pageSize=1000).execute()
379386
for change in changelist['changes']:
@@ -385,7 +392,7 @@ def update(self):
385392
file = self.get_trashed_item(change['fileId'])
386393
if file:
387394
print(f"Trashed item was permanently deleted: \"{file['name']}\"")
388-
file_ids_removed.add(change['fileId'])
395+
file_ids_removed[change['fileId']] = change.get('time')
389396
else:
390397
file_ids_to_fetch.add(change['fileId'])
391398
if 'nextPageToken' in changelist: # nextPageToken signals there is more to fetch
@@ -394,9 +401,9 @@ def update(self):
394401
changes_page = changelist['newStartPageToken'] # newStartPageToken says come back later for more
395402
break
396403
if len(file_ids_removed):
397-
for fileId in file_ids_removed:
398-
self._move_to_trash(fileId)
399-
file_ids_to_fetch = list(file_ids_to_fetch - file_ids_removed)
404+
for fileId, time in file_ids_removed.items():
405+
self._move_to_trash(fileId, trashed_time=time)
406+
file_ids_to_fetch = list(file_ids_to_fetch - set(file_ids_removed.keys()))
400407
if len(file_ids_to_fetch):
401408
all_items = gdrive_base.batch_get_files_by_id(file_ids_to_fetch, FILE_FIELDS)
402409
for item in tqdm(all_items, total=len(file_ids_to_fetch), desc="Fetching updated files"):
@@ -683,6 +690,25 @@ def find_duplicate_md5s(self) -> List[str]:
683690
self.cursor.execute(sql)
684691
return [row['md5_checksum'] for row in self.cursor.fetchall()]
685692

693+
def get_cache_path_for_md5(self, hashval: str) -> Path | None:
694+
"""Returns None if the hashval is unknown to me"""
695+
assert len(hashval) == 32
696+
remote_files = self.get_items_with_md5(hashval)
697+
if not remote_files:
698+
remote_files = self.get_trashed_items_with_md5(hashval)
699+
if remote_files:
700+
rm_date = max(f['trashed_time'] or f['modifiedTime'] for f in remote_files)
701+
rm_date = datetime.fromisoformat(rm_date)
702+
return self.file_cache_dir / 'trash' / str(rm_date.year) / f"{rm_date.month:02d}" / remote_files[0]['name']
703+
return None
704+
file = remote_files[0]
705+
extension = ''
706+
if '.' in file['name']:
707+
extension = '.' + str(file['name']).split('.')[-1].lower()
708+
if len(extension) < 1 or len(extension) > 6:
709+
extension = guess_extension(file['mimeType']) or ''
710+
return self.file_cache_dir / hashval[:2] / f"{hashval[2:]}{extension}"
711+
686712
########
687713
# Write-through Functions
688714
#
@@ -701,14 +727,26 @@ def trash_file(self, file_id: str):
701727

702728
gdrive_base.trash_drive_file(file_id)
703729
with self._lock:
704-
self._move_to_trash(file_id)
730+
self._move_to_trash(file_id, trashed_time=UTC_NOW())
705731
self.conn.commit()
706732

707733
@locked
708-
def _move_to_trash(self, file_id: str):
734+
def _move_to_trash(self, file_id: str, trashed_time: str = None):
735+
# If we get a removal event from the API for a file already in the trash,
736+
# only add the timestamp to the trash table if the item had a NULL trashed time before.
737+
self.cursor.execute("SELECT trashed_time FROM trashed_drive_items WHERE id = ?", (file_id,))
738+
row = self.cursor.fetchone()
739+
if row:
740+
if trashed_time and row['trashed_time'] is None:
741+
self.cursor.execute("UPDATE trashed_drive_items SET trashed_time = ? WHERE id = ?", (trashed_time, file_id))
742+
return
743+
709744
self.cursor.execute("INSERT INTO trashed_drive_items SELECT * FROM drive_items WHERE id = ?", (file_id,))
710745
self.cursor.execute("DELETE FROM drive_items WHERE id = ?", (file_id,))
711746

747+
if trashed_time:
748+
self.cursor.execute("UPDATE trashed_drive_items SET trashed_time = ? WHERE id = ?", (trashed_time, file_id))
749+
712750
def move_file(self, file_id: str, folder: str, previous_parents=None, verbose=True):
713751
folder = gdrive_base.folderlink_to_id(folder) if folder.startswith("http") else folder
714752
with self._lock:

0 commit comments

Comments
 (0)