Skip to content

Commit 1ba79ff

Browse files
committed
Finalize the nearestpdf finder
[skip ci]
1 parent 4b27060 commit 1ba79ff

4 files changed

Lines changed: 217 additions & 50 deletions

File tree

‎scripts/coredownloader.py‎

Lines changed: 66 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,86 @@
11
#!/bin/python3
"""Match website PDF items against CORE works via nearestpdf.

For every website content item that is a PDF with a Google Drive link,
resolve its DOI(s) against the local CORE cache, then ask nearestpdf for
the best-matching Drive file.  Confident single matches are printed; any
mismatch drops into a debugger for manual inspection.
"""

import json

from tqdm import tqdm  # noqa: F401 -- kept for ad-hoc progress bars
from yaspin import yaspin

import gdrive
import journals
import nearestpdf
import titlematch  # noqa: F401 -- kept for interactive use in the debugger
import website
from local_core import (
    CoreAPIWorksCache,
)

with yaspin(text="Loading website..."):
    website.load()

nearestpdf.load()


TRACKING_ISSNS = [issn for val in journals.issns.values() for issn in val]  # 268
# but there's a lot of mess here: other languages, review articles... how to filter?

# "Buddhist" articles that aren't for download
MDPI_PROVIDER_ID = 22080  # 351
ANTI_TRACKING_KEYWORDS = [
    "1556-5068",  # SSRN ISSN. 2 # registered as query 3
    "NFTs",  # 166
    "blockchain",  # 869 # registered as 4
    "documentType:review",  # 1040
]

# NOTE(review): hard-coded local path -- consider an env var or CLI argument.
core = CoreAPIWorksCache('/home/khbh/Desktop/core_api.db')

# Accumulators kept for interactive inspection from the breakpoint below.
self_similarities = []
self_plus_similarities = []
differences = []
title_similarities = []

for website_item in website.content:
    # Only consider PDF items that carry a Drive link known to nearestpdf.
    # Guard against an empty formats list (original indexed [0] unguarded).
    if not website_item.formats or website_item.formats[0] != 'pdf':
        continue
    if not website_item.get("drive_links"):
        continue
    drive_id = gdrive.link_to_id(website_item['drive_links'][0])
    if drive_id not in nearestpdf.gid_to_idx:
        continue
    # Collect every DOI-shaped URL on the item, stripped to the bare DOI.
    dois = [
        doi.split('doi.org/')[1]
        for doi in [
            website_item.get('source_url', ''),
            website_item.get('external_url', ''),
            website_item.get('doi', ''),
            website_item.get('alternate_doi', ''),
            website_item.get('alternative_doi', ''),
        ]
        if 'doi.org/' in doi
    ]
    # First DOI that resolves in the local CORE cache wins.
    core_work = None
    for doi in dois:
        core_work = core.get_locally_from_doi(doi)
        if core_work:
            break
    if not core_work:
        continue
    if not core_work['full_text']:
        continue
    text_plus = f"{core_work['full_text']} {core_work['title']} {core_work['abstract'] or ''}"
    # Author metadata is best-effort: missing/malformed JSON means no authors.
    # Default is a list (the original used '' -- a string, which would be
    # iterated per-character downstream); catch only the errors json.loads
    # and the key lookup can raise instead of a bare except.
    authors = []
    try:
        authors = [auth['name'] for auth in json.loads(core_work['authors'])]
    except (TypeError, ValueError, KeyError):
        pass
    matches = nearestpdf.find_matching_files(
        core_work['title'],
        authors,
        text_plus,
    )
    # Looked up so it is available in the debugger session below.
    drive_file = nearestpdf.google_files[nearestpdf.gid_to_idx[drive_id]]
    if len(matches) == 1 and matches[0][0]['id'] == drive_id:
        print("Got it with confidence", matches[0][1])
    else:
        # Deliberate breakpoint: inspect mismatches by hand.
        import ipdb; ipdb.set_trace()
3784

3885

3986

‎scripts/local_core.py‎

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -400,8 +400,8 @@ def load_another_page_from_query(self, query_id: int) -> int:
400400
return ret
401401

402402
@locked
403-
def _get_local_by_doi(self, doi: str) -> dict | None:
404-
"""Helper to find the work with the given DOI in the local database"""
403+
def get_locally_from_doi(self, doi: str) -> dict | None:
404+
"""If we have the work for this doi locally already, returns it, else None"""
405405
self.cursor.execute(
406406
"SELECT works.* FROM works JOIN identifiers ON works.id = identifiers.work_id WHERE identifiers.id = ? AND identifiers.id_type = 'DOI'",
407407
(doi, )
@@ -427,7 +427,6 @@ def _get_local_by_doi(self, doi: str) -> dict | None:
427427
# use a combination of citations and english score to pick one
428428
# Using lower ids as the tie breaker
429429
if len(ret) > 1:
430-
print(f"WARNING: Found {len(ret)} works for DOI:{doi}")
431430
ret.sort(key=lambda r: r['id'])
432431
ret.sort(
433432
key=lambda r: 2*(r['en_confidence'] or 0)+(r['citation_count'] or 0),
@@ -457,7 +456,7 @@ def bulk_get_by_doi(self, dois: list[str], max_per_batch: int = 0, verbose: bool
457456
now = current_timestamp()
458457

459458
for i, doi in enumerate(dois):
460-
work = self._get_local_by_doi(doi)
459+
work = self.get_locally_from_doi(doi)
461460
if work:
462461
results[i] = work
463462
if verbose:
@@ -504,7 +503,7 @@ def bulk_get_by_doi(self, dois: list[str], max_per_batch: int = 0, verbose: bool
504503

505504
# For each DOI in our batch, check if we found it (via the DB lookup)
506505
for doi in batch:
507-
work = self._get_local_by_doi(doi)
506+
work = self.get_locally_from_doi(doi)
508507
if work:
509508
for idx in to_fetch[doi]:
510509
results[idx] = work
@@ -524,6 +523,14 @@ def bulk_get_by_doi(self, dois: list[str], max_per_batch: int = 0, verbose: bool
524523

525524
return results
526525

526+
@locked
def get_local_works_for_query(self, query_id: int) -> list[dict]:
    """Return every work cached locally for the query *query_id*.

    Joins ``works`` against the ``query_works`` link table and returns
    each matching row as a plain dict.  An unknown query_id yields an
    empty list.
    (Assumes the connection's row_factory yields mapping-style rows,
    e.g. sqlite3.Row -- confirm against the cache's __init__.)
    """
    self.cursor.execute(
        "SELECT works.* FROM works JOIN query_works ON query_works.work_id = works.id WHERE query_works.query_id = ?",
        (query_id, )
    )
    return [dict(row) for row in self.cursor.fetchall()]
533+
527534
@locked
528535
def close(self):
529536
if self.conn:

0 commit comments

Comments
 (0)