@@ -437,7 +437,16 @@ def process_duplicate_files(files: list[dict[str, any]], folder_slugs: dict[str,
437437 gcache .trash_file (f ['id' ])
438438 return files_to_keep
439439
440- def select_ids_to_keep (files : list [dict [str , any ]], folder_slugs : dict [str , str ]) -> tuple [list [str ], str ]:
440+ class IDSelectionReason (enum .StrEnum ):
441+ IS_PUBLIC = 'is public'
442+ GENERIC_SUBFOLDER = 'generic subfolder'
443+ TAG_PRIORITY = 'tag priority'
444+ NAME_LENGTH = 'name length'
445+ ELDEST_FILE = 'eldest file'
446+ FOLDER_DEPTH = 'folder depth'
447+
448+
449+ def select_ids_to_keep (files : list [dict [str , any ]], folder_slugs : dict [str , str ]) -> tuple [list [str ], IDSelectionReason ]:
441450 """Meticulously applies hand-crafted heuristics to select the keepers
442451
443452 folder_slugs is a map from gid to tag slug, passed in to avoid recompute
@@ -487,7 +496,7 @@ def select_ids_to_keep(files: list[dict[str, any]], folder_slugs: dict[str, str]
487496 if num_slugs == 1 :
488497 # if there's only one file in a slugged folder, keep that one
489498 # no need to even check for permissions
490- return [files [slugs .index (important_slugs [0 ])]['id' ]], 'is public'
499+ return [files [slugs .index (important_slugs [0 ])]['id' ]], IDSelectionReason . IS_PUBLIC
491500
492501 #####
493502 # Don't trash any publicly-launched files
@@ -497,7 +506,7 @@ def select_ids_to_keep(files: list[dict[str, any]], folder_slugs: dict[str, str]
497506 num_public = sum (are_publics )
498507 if num_public > 0 :
499508 # Never suggest a public-facing file for deletion
500- return [files [i ]['id' ] for i in range (len (files )) if are_publics [i ]], 'is public'
509+ return [files [i ]['id' ] for i in range (len (files )) if are_publics [i ]], IDSelectionReason . IS_PUBLIC
501510
502511 #####
503512 # Discard files in "unimportant" subfolders first
@@ -514,7 +523,7 @@ def select_ids_to_keep(files: list[dict[str, any]], folder_slugs: dict[str, str]
514523 if unread_count > 0 and unread_count < len (files ):
515524 files = [file for i , file in enumerate (files ) if not unreads [i ]]
516525 if len (files ) == 1 :
517- return [files [0 ]['id' ]], 'generic subfolder'
526+ return [files [0 ]['id' ]], IDSelectionReason . GENERIC_SUBFOLDER
518527 slugs = [slug for i , slug in enumerate (slugs ) if not unreads [i ]]
519528
520529 #####
@@ -529,7 +538,7 @@ def select_ids_to_keep(files: list[dict[str, any]], folder_slugs: dict[str, str]
529538 assert len (files ) == len (priorities )
530539 files = [files [i ] for i in range (len (files )) if priorities [i ] == highest ]
531540 if len (files ) == 1 :
532- return [files [0 ]['id' ]], 'tag priority'
541+ return [files [0 ]['id' ]], IDSelectionReason . TAG_PRIORITY
533542
534543 #####
535544 # If some couldn't be disambiguated by folder because they are in
@@ -542,12 +551,12 @@ def select_ids_to_keep(files: list[dict[str, any]], folder_slugs: dict[str, str]
542551 longest = max (name_lens )
543552 files = [file for file in files if len (file ['name' ])== longest ]
544553 if len (files ) == 1 :
545- return [file ['id' ] for file in files ], 'name length'
554+ return [file ['id' ] for file in files ], IDSelectionReason . NAME_LENGTH
546555 # That failing, pick the eldest
547556 modifies = [file ['modifiedTime' ] for file in files ]
548557 eldest = min (modifies )
549558 idx = modifies .index (eldest )
550- return [files [idx ]['id' ]], 'eldest file'
559+ return [files [idx ]['id' ]], IDSelectionReason . ELDEST_FILE
551560
552561 #####
553562 # Disambiguate remaining folders by depth
@@ -572,7 +581,7 @@ def select_ids_to_keep(files: list[dict[str, any]], folder_slugs: dict[str, str]
572581 file ['root' ] = parent
573582 roots = set (file ['root' ]['id' ] for file in files )
574583 assert len (roots ) == 1 , f"Multiple roots found for { files } "
575- return [deepest ['id' ]], 'folder depth'
584+ return [deepest ['id' ]], IDSelectionReason . FOLDER_DEPTH
576585
577586def remote_file_for_local_file (fp : Path , folder_slugs : dict [str , str ], default_folder_id = None ) -> dict | None :
578587 """Ensures that there is exactly one copy of `fp` on Drive and returns it.
@@ -633,6 +642,21 @@ def find_duplicate_urls() -> list[str]:
633642 gcache .cursor .execute (sql )
634643 return [row ['value' ] for row in gcache .cursor .fetchall ()]
635644
645+ def fetch_files_distinction_pointer (gc : local_gdrive .DriveCache , file_id : str ) -> str | None :
646+ with gc ._lock :
647+ gc .cursor .execute ("SELECT value FROM item_properties WHERE key = 'distinctFrom' AND file_id = ?" , (file_id ,))
648+ pointing_to = gc .cursor .fetchone ()
649+ if not pointing_to :
650+ return None
651+ return pointing_to ['value' ]
652+
653+ def fetch_distinct_file_pointing_to (gc : local_gdrive .DriveCache , target_file_id : str ) -> str | None :
654+ with gc ._lock :
655+ gc .cursor .execute ("SELECT file_id FROM item_properties WHERE key = 'distinctFrom' AND value = ?" , (target_file_id ,))
656+ pointing_neighbor = gc .cursor .fetchone ()
657+ if pointing_neighbor :
658+ return pointing_neighbor ['file_id' ]
659+ return None
636660
637661class ClosePairDecision (enum .StrEnum ):
638662 FIRST_IS_OLD_VERSION = 'old a'
@@ -678,6 +702,7 @@ class FileDistinctionManager:
678702 def __init__ (self , gc : local_gdrive .DriveCache = None ):
679703 if gc is None :
680704 gc = gcache
705+ self ._folder_slugs = None
681706 self .gcache = gc
682707 self .fileid_to_distinct_neighbors : dict [str , set [str ]]
683708 self .fileid_to_distinct_neighbors = dict ()
@@ -694,7 +719,7 @@ def __init__(self, gc: local_gdrive.DriveCache=None):
694719 )
695720 ]
696721 pointers = {p ['file_id' ]: p ['value' ] for p in pointers }
697- pointers = self .fix_pointers (pointers )
722+ pointers = self ._fix_pointers (pointers )
698723 for k in pointers .keys ():
699724 self .fileid_to_distinct_neighbors [k ] = set ()
700725 node = pointers [k ]
@@ -706,6 +731,104 @@ def __init__(self, gc: local_gdrive.DriveCache=None):
706731 def are_distinct (self , file_a : str , file_b : str ) -> bool :
707732 """Returns True iff file_a and file_b are marked distinct already"""
708733 return file_a in self .fileid_to_distinct_neighbors and file_b in self .fileid_to_distinct_neighbors [file_a ]
734+
735+ def folder_slugs (self ):
736+ if self ._folder_slugs :
737+ return self ._folder_slugs
738+ self ._folder_slugs = load_folder_slugs ()
739+ return self ._folder_slugs
740+
741+ def handle_close_pair_decision (self , decision : ClosePairDecision , actual_file_a : dict , actual_other_file : dict ):
742+ """
743+ Handles whatever file moving, trashing, and Distinction pointer swapping as needed to actualize `decision`
744+
745+ Args:
746+ `decision` is relative to `is_duplicate_prompt(actual_file_a, actual_other_file)` in that order
747+ """
748+ if decision == ClosePairDecision .THEY_ARE_DISTINCT :
749+ return self .mark_distinct (actual_other_file ['id' ], actual_file_a ['id' ])
750+ def _print_shortcuts (shortcuts : list [dict ]):
751+ for shortcut in shortcuts :
752+ print (f" \" { shortcut ['name' ]} \" " )
753+ print (f" in { FOLDER_LINK .format (shortcut ['parent_id' ])} " )
754+ would_keep , reason = select_ids_to_keep (
755+ [actual_other_file , actual_file_a ],
756+ folder_slugs = self .folder_slugs (),
757+ )
758+ selected_to_keep = None
759+ selected_to_not = None
760+ if len (would_keep ) > 1 and decision == ClosePairDecision .THEY_ARE_THE_SAME :
761+ assert reason == IDSelectionReason .IS_PUBLIC
762+ print ("Both files are publicly launched!" )
763+ print ("Please handle manually and select one of these to keep:" )
764+ choice = radio_dial ([
765+ DRIVE_LINK .format (actual_file_a ['id' ]),
766+ DRIVE_LINK .format (actual_other_file ['id' ]),
767+ ])
768+ if choice == 0 :
769+ decision = ClosePairDecision .SECOND_IS_OLD_VERSION
770+ elif choice == 1 :
771+ decision = ClosePairDecision .FIRST_IS_OLD_VERSION
772+ else :
773+ raise ValueError ("radio_dial should output 0 or 1 for a binary choice, no?" )
774+ if decision == ClosePairDecision .FIRST_IS_OLD_VERSION :
775+ selected_to_not , selected_to_keep = actual_file_a , actual_other_file
776+ elif decision == ClosePairDecision .SECOND_IS_OLD_VERSION :
777+ selected_to_not , selected_to_keep = actual_other_file , actual_file_a
778+ else :
779+ assert decision == ClosePairDecision .THEY_ARE_THE_SAME
780+ assert len (would_keep ) == 1
781+ if would_keep [0 ] == actual_file_a ['id' ]:
782+ selected_to_keep = actual_file_a
783+ selected_to_not = actual_other_file
784+ else :
785+ assert would_keep [0 ] == actual_other_file ['id' ]
786+ selected_to_keep = actual_other_file
787+ selected_to_not = actual_file_a
788+ if len (would_keep ) == 1 and would_keep [0 ] != selected_to_keep ['id' ]:
789+ if reason == IDSelectionReason .IS_PUBLIC :
790+ print ("The file you've selected as the old version is public" )
791+ print ("Please resolve this manually and then we'll move it to Old Versions." )
792+ input ("Press enter to continue..." )
793+ elif selected_to_not ['parent_id' ] != selected_to_keep ['parent_id' ]:
794+ shortcuts = self .gcache .get_shortcuts_to_file (selected_to_keep ['id' ])
795+ if shortcuts :
796+ print (f"The file you've chosen to keep and move to { FOLDER_LINK .format (selected_to_not ['parent_id' ])} has shortcuts:" )
797+ _print_shortcuts (shortcuts )
798+ input ("Please handle them and then press enter to continue..." )
799+ self .gcache .move_file (selected_to_keep ['id' ], selected_to_not ['parent_id' ], selected_to_keep ['parents' ])
800+ if 'distinctFrom' in selected_to_not ['properties' ]:
801+ if 'distinctFrom' not in selected_to_keep ['properties' ]:
802+ # simply swap out keep for not
803+ point_to_not_id = fetch_distinct_file_pointing_to (self .gcache , selected_to_not ['id' ])
804+ self ._write_pointer (point_to_not_id , selected_to_keep ['id' ])
805+ self ._write_pointer (selected_to_keep ['id' ], selected_to_not ['properties' ]['distinctFrom' ])
806+ self ._write_pointer (selected_to_not ['id' ], None )
807+ self .fileid_to_distinct_neighbors [selected_to_keep ['id' ]] = self .fileid_to_distinct_neighbors [selected_to_not ['id' ]]
808+ del self .fileid_to_distinct_neighbors [selected_to_not ['id' ]]
809+ for n in self .fileid_to_distinct_neighbors [selected_to_keep ['id' ]]:
810+ self .fileid_to_distinct_neighbors [n ].remove (selected_to_not ['id' ])
811+ self .fileid_to_distinct_neighbors [n ].add (selected_to_keep ['id' ])
812+ else :
813+ # We can assume they aren't in the same cluster as they were just
814+ # marked as the same, ergo not distinct
815+ assert selected_to_not ['id' ] not in self .fileid_to_distinct_neighbors [selected_to_keep ['id' ]]
816+ # since these two are marked the same, we should merge their clusters into a super-cluster
817+ super_cluster = self .fileid_to_distinct_neighbors [selected_to_keep ['id' ]] | \
818+ self .fileid_to_distinct_neighbors [selected_to_not ['id' ]]
819+ super_cluster .add (selected_to_keep ['id' ])
820+ # TODO: There's a more efficient way to do this with snipping
821+ self ._make_new_cycle (super_cluster )
822+ self ._write_pointer (selected_to_not ['id' ], None )
823+ for n in super_cluster :
824+ nn = super_cluster .copy ()
825+ nn .remove (n )
826+ self .fileid_to_distinct_neighbors [n ] = nn
827+ # else: # the one we've marked for removal isn't part of the distinctions graph, so nothing to do here
828+ # Now, all that's left is to handle the marking!
829+ print ("[Action] Moving old version to Old Versions..." )
830+ move_gfile (selected_to_not ['id' ], (OLD_VERSIONS_FOLDER_ID , None ))
831+ return
709832
710833 def mark_distinct (self , file_a : str , file_b : str ):
711834 """Writes to the DB the fact that file_a and file_b are distinctFrom each other"""
@@ -721,47 +844,70 @@ def mark_distinct(self, file_a: str, file_b: str):
721844 return
722845 if file_b not in self .fileid_to_distinct_neighbors :
723846 file_a , file_b = file_b , file_a
847+ actual_file_a = self .gcache .get_item (file_a )
848+ actual_file_b = self .gcache .get_item (file_b )
849+ file_b_points_to = actual_file_b ['properties' ]['distinctFrom' ]
850+ print (f"Adding { file_a } to the { file_b } cluster:" )
851+ for other_file in self .fileid_to_distinct_neighbors [file_b ]:
852+ actual_other_file = self .gcache .get_item (other_file )
853+ decision = is_duplicate_prompt (
854+ actual_file_a ,
855+ actual_other_file ,
856+ )
857+ if decision == ClosePairDecision .THEY_ARE_DISTINCT :
858+ continue
859+ return self .handle_close_pair_decision (
860+ decision ,
861+ actual_file_a ,
862+ actual_other_file ,
863+ )
724864 if file_a not in self .fileid_to_distinct_neighbors :
725- # file_a is new, but file_b is part of a group already
726- print (f"Adding { file_a } to the { file_b } cluster:" )
727- actual_file_a = self .gcache .get_item (file_a )
728- decisions = dict ()
729- file_pointing_to_b = None
730- for other_file in self .fileid_to_distinct_neighbors [file_b ]:
731- actual_other_file = self .gcache .get_item (other_file )
732- if actual_other_file ['properties' ]['distinctFrom' ] == file_b :
733- file_pointing_to_b = actual_other_file
734- decisions [other_file ] = is_duplicate_prompt (
735- actual_other_file ,
736- actual_file_a ,
737- )
738- if all (decision == ClosePairDecision .THEY_ARE_DISTINCT for decision in decisions .values ()):
739- # Easy case. Just add it to the group
740- self ._write_pointer (file_pointing_to_b ['id' ], file_a )
741- self ._write_pointer (file_a , file_b )
742- self .fileid_to_distinct_neighbors [file_a ] = self .fileid_to_distinct_neighbors [file_b ].copy ()
743- self .fileid_to_distinct_neighbors [file_a ].add (file_b )
744- for other_fid in self .fileid_to_distinct_neighbors [file_a ]:
745- self .fileid_to_distinct_neighbors [other_fid ].add (file_a )
746- return
747- import ipdb
748- print ("Teach me how to do a complex insertion" )
749- ipdb .set_trace ()
750-
751- import ipdb
752- print ("Teach FileDistinctionManager how to handle merging two cycles" )
753- ipdb .set_trace ()
754-
755-
865+ # file_a is distinct from the entire file_b cluster
866+ # and has no cluster of its own, so just add it to the group
867+ self ._write_pointer (file_a , file_b_points_to )
868+ self ._write_pointer (file_b , file_a )
869+ self .fileid_to_distinct_neighbors [file_a ] = self .fileid_to_distinct_neighbors [file_b ].copy ()
870+ self .fileid_to_distinct_neighbors [file_a ].add (file_b )
871+ for other_fid in self .fileid_to_distinct_neighbors [file_a ]:
872+ self .fileid_to_distinct_neighbors [other_fid ].add (file_a )
873+ return
874+ # else, file_a has its own cluster
875+ print (f"Adding { file_b } to the { file_a } cluster:" )
876+ for other_file in self .fileid_to_distinct_neighbors [file_a ]:
877+ actual_other_file = self .gcache .get_item (other_file )
878+ decision = is_duplicate_prompt (
879+ actual_file_b ,
880+ actual_other_file ,
881+ )
882+ if decision == ClosePairDecision .THEY_ARE_DISTINCT :
883+ continue
884+ return self .handle_close_pair_decision (
885+ decision ,
886+ actual_file_b ,
887+ actual_other_file ,
888+ )
889+ # At this point we have two clusters and all are distinct, so merge them
890+ self ._write_pointer (file_b , actual_file_a ['properties' ]['distinctFrom' ])
891+ self ._write_pointer (file_a , file_b_points_to )
892+ super_cluster = self .fileid_to_distinct_neighbors [file_a ] | \
893+ self .fileid_to_distinct_neighbors [file_b ] | set ([file_a , file_b ])
894+ for n in super_cluster :
895+ nn = super_cluster .copy ()
896+ nn .remove (n )
897+ self .fileid_to_distinct_neighbors [n ] = nn
898+ return
756899
757900 def _write_pointer (self , from_id : str , to_id : str | None ):
758901 """Commits this new pointer to the DB"""
759902 if to_id is not None :
760903 assert re .fullmatch (GFIDREGEX , to_id ), f"_write_pointer got a non-ID: { to_id } "
904+ print (f"[Info] Marking { from_id } distinctFrom { to_id } " )
761905 self .gcache .write_property (from_id , 'distinctFrom' , to_id )
762906
763907 def _make_new_cycle (self , nodes : Iterable [str ]) -> dict [str , str ]:
764908 """Takes a collection of nodes and writes them as a cycle to the DB.
909+
910+ NOTE: Does not update self.fileid_to_distinct_neighbors (hence an _method)
765911
766912 Returns: the pointers map of which were made to point to which."""
767913 node_iter = iter (nodes )
@@ -778,7 +924,7 @@ def _make_new_cycle(self, nodes: Iterable[str]) -> dict[str, str]:
778924 return ret
779925
780926
781- def fix_pointers (self , pointers : dict [str , str ]) -> dict [str , str ]:
927+ def _fix_pointers (self , pointers : dict [str , str ]) -> dict [str , str ]:
782928 """Takes a dictionary of file ids to the file ids they point to
783929 It ensures that there are no dangling nodes, writing corrections to the DB
784930 as necessary and it returns the cleaned graph.
@@ -808,7 +954,7 @@ def fix_pointers(self, pointers: dict[str, str]) -> dict[str, str]:
808954 added_down .add (n )
809955 new_pointers = self ._make_new_cycle (added_up | added_down )
810956 pointers .update (new_pointers )
811- return self .fix_pointers (pointers )
957+ return self ._fix_pointers (pointers )
812958 for k in pointers .keys ():
813959 if k not in parents :
814960 leaf = k
@@ -826,11 +972,11 @@ def fix_pointers(self, pointers: dict[str, str]) -> dict[str, str]:
826972 if leaf == k :
827973 self ._write_pointer (leaf , None )
828974 del pointers [k ]
829- return self .fix_pointers (pointers )
975+ return self ._fix_pointers (pointers )
830976 else :
831977 self ._write_pointer (leaf , k )
832978 pointers [leaf ] = k
833- return self .fix_pointers (pointers )
979+ return self ._fix_pointers (pointers )
834980 return pointers
835981
836982 def clear_distinctions_from (self , file_id : str , pointing_to : str = None ):
@@ -845,14 +991,10 @@ def clear_distinctions_from(self, file_id: str, pointing_to: str = None):
845991 del self .fileid_to_distinct_neighbors [file_id ]
846992 del self .fileid_to_distinct_neighbors [neighbor ]
847993 return
848- with self .gcache ._lock :
849- self .gcache .cursor .execute ("SELECT file_id FROM item_properties WHERE key = 'distinctFrom' AND value = ?" , (file_id ,))
850- pointing_neighbor = self .gcache .cursor .fetchone ()['file_id' ]
994+ pointing_neighbor = fetch_distinct_file_pointing_to (self .gcache , file_id )
851995 assert pointing_neighbor in neighbors
852996 if not pointing_to :
853- with self .gcache ._lock :
854- self .gcache .cursor .execute ("SELECT value FROM item_properties WHERE key = 'distinctFrom' AND file_id = ?" , (file_id ,))
855- pointing_to = self .gcache .cursor .fetchone ()['value' ]
997+ pointing_to = fetch_files_distinction_pointer (self .gcache , file_id )
856998 assert pointing_to in neighbors
857999 assert pointing_to != pointing_neighbor
8581000 self ._write_pointer (pointing_neighbor , pointing_to )
@@ -862,13 +1004,11 @@ def clear_distinctions_from(self, file_id: str, pointing_to: str = None):
8621004 del self .fileid_to_distinct_neighbors [file_id ]
8631005
8641006def move_distinctions_off_file (gc : local_gdrive .DriveCache , file_id : str ) -> None :
865- with gc ._lock :
866- gc .cursor .execute ("SELECT value FROM item_properties WHERE key = 'distinctFrom' AND file_id = ?" , (file_id ,))
867- pointing_to = gc .cursor .fetchone ()
1007+ pointing_to = fetch_files_distinction_pointer (gc , file_id )
8681008 if not pointing_to :
8691009 return
8701010 distinctions = FileDistinctionManager (gc )
871- distinctions .clear_distinctions_from (file_id , pointing_to [ 'value' ] )
1011+ distinctions .clear_distinctions_from (file_id , pointing_to )
8721012
8731013gcache .register_trash_callback (move_distinctions_off_file )
8741014
0 commit comments