@@ -2,16 +2,19 @@
 
 import requests
 import sqlite3
+import tempfile
 import json
 import re
-from datetime import datetime, timezone
+from datetime import datetime, timezone, timedelta
 from pathlib import Path
 import threading
 from time import sleep
 from enum import IntEnum
 from language_detection import LANGUAGE_DETECTOR, Language
-from strutils import author_name_to_normal
+from strutils import author_name_to_normal, md5
 import nearestpdf
+from tqdm import tqdm
+from downloadutils import download, pdf_name_for_work
 
 # Maybe a better place to put this mutual dependency?
 from local_gdrive import locked
@@ -604,6 +607,139 @@ def match_gfiles_to_local_works(self): |
             found += 1
         print(f"Found Google Drive files for {found} works and added them to the DB")
 
+    @locked
+    def mark_download(self, work_id: str, success: bool, timestamp: int = None):
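+        # A successful attempt stores the timestamp as-is; a failed attempt stores
+        # it negated, so the work can be picked up again for retries later.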
+        if not timestamp:
+            timestamp = current_timestamp()
+        if not success:
+            timestamp = -timestamp
+        self.cursor.execute("""
+            UPDATE works SET downloaded_date = ? WHERE id = ?
+            """,
+            (timestamp, work_id,)
+        )
+        self.conn.commit()
+
+    def _attempt_to_download(self, work: dict | sqlite3.Row, to_folder: Path) -> Path | None:
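+        # Tries, in order: the work's SOURCE_URL identifier, the alternate URLs the
+        # CORE outputs API reports for it, that output's downloadUrl, and finally the
+        # work's own download_url. Returns the path of the saved PDF, or None.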
+        work = dict(work)
+        filename = pdf_name_for_work(work)
+        outpath = to_folder.joinpath(filename)
+        with self._lock:
+            source = self.cursor.execute("""
+                SELECT id FROM identifiers WHERE work_id = ? AND id_type = 'SOURCE_URL' LIMIT 1
+                """,
+                (work['id'], )
+            ).fetchone()
+        if source:
+            succ = download(source['id'], outpath, expected_type='pdf')
+            if succ:
+                return outpath
+        if work.get('download_url'):
+            match = re.fullmatch(
+                r'https://core\.ac\.uk/download/(?:pdf/)?([0-9]+)\.pdf',
+                work['download_url'],
+            )
+            output = call_api(f"outputs/{match.group(1)}", {}) if match else {}
+            for url in output.get('urls', []):
+                if source and url == source['id']:
+                    continue
+                succ = download(url, outpath, expected_type='pdf')
+                if succ:
+                    return outpath
+            if output.get('downloadUrl'):
+                succ = download(output['downloadUrl'], outpath, expected_type='pdf')
+                if succ:
+                    return outpath
+            if work['download_url'] != output.get('downloadUrl'):
+                succ = download(work['download_url'], outpath, expected_type='pdf')
+                if succ:
+                    return outpath
+        return None
+
+    def attempt_downloads_for_query(
+        self,
+        query_id: int,
+        to_folder: Path = None,
+        min_en_conf: float = 0.8,
+        min_drive_conf: float = 0.6,
+        retry_timedelta: int | timedelta = 15811200000,
+    ) -> int:
+        """
+        Args:
+            to_folder: If you'd like to keep the downloaded files, supply a folder.
+                Otherwise they are written to a temporary directory and discarded.
+            retry_timedelta: How long to wait before retrying a previously failed
+                download, as a timedelta or an integer number of milliseconds
+                (the default is 183 days).
+
+        Returns:
+            The number of PDFs that were newly imported (rather than matched to
+            files already on Drive).
+        """
+        works = self.get_local_works_for_query(query_id)
+        # Filter out non-English works
+        works = [work for work in works if work['en_confidence'] >= min_en_conf]
+        # Filter out works that we downloaded successfully or tried recently
+        if isinstance(retry_timedelta, timedelta):
+            retry_timedelta = int(retry_timedelta.total_seconds() * 1000)
+        since = -(current_timestamp() - retry_timedelta)
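+        # downloaded_date > 0 records a past success; a negative value records a
+        # failed attempt at -downloaded_date. Keep works that were never attempted,
+        # plus failures that happened more than retry_timedelta ago.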
+        works = [
+            work for work in works
+            if work.get('downloaded_date') is None
+            or (work['downloaded_date'] <= 0 and work['downloaded_date'] > since)
+        ]
+        # Filter out works we already have on Drive
+        with self._lock:
+            works = [
+                work for work in works if
+                self.cursor.execute(
+                    "SELECT * FROM work_gfiles WHERE work_id = ? AND pval > ? LIMIT 1",
+                    (work['id'], min_drive_conf, )
+                ).fetchone() is None
+            ]
+        print(f"Attempting to download {len(works)} works from query {query_id}...")
+        pbar = tqdm(works)
+        ret = 0
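+        # Imported here rather than at module level, presumably to avoid circular
+        # imports between this module and the Drive/import helpers.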
+        import pypdf.errors
+        from pdfutils import readpdf
+        from bulk_import import BulkPDFImporter, BulkPDFType
+        import gdrive
+        import nearestpdf
+        nearestpdf.load()
+        importer = BulkPDFImporter(BulkPDFType.CORE_API)
+        with tempfile.TemporaryDirectory() as temp_dir:
+            if not to_folder:
+                to_folder = Path(temp_dir)
+            for work in pbar:
+                succ = self._attempt_to_download(work, to_folder)
+                if succ:
+                    self.mark_download(work['id'], True)
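+                    # If a byte-identical copy (even a trashed one) is already on
+                    # Drive, reuse its file id instead of uploading again.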
+                    file_md5 = md5(succ)
+                    existing = gdrive.gcache.get_items_with_md5(file_md5)
+                    if not existing:
+                        existing = gdrive.gcache.get_trashed_items_with_md5(file_md5)
+                    if not existing:
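+                        # No exact copy found; look for near-duplicates on Drive by
+                        # title, normalized author names, and the downloaded PDF itself.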
+                        authors = work['authors']
+                        if isinstance(authors, str):
+                            authors = json.loads(authors)
+                        assert isinstance(authors, list)
+                        authors = [author_name_to_normal(author['name']) for author in authors]
+                        try:
+                            fuzzy_dupes = nearestpdf.find_matching_files(work['title'], authors, readpdf(succ))
+                        except (pypdf.errors.PdfReadError, pypdf.errors.PdfStreamError):
+                            pbar.write("Didn't get a valid PDF :(")
+                            self.mark_download(work['id'], False)
+                            continue
+                        if fuzzy_dupes:
+                            pbar.write(f"Found a fuzzy duplicate for \"{succ}\" on GDrive: \"{fuzzy_dupes[0][0]['name']}\"")
+                            skip_upload = False
+                            for dupe in fuzzy_dupes:
+                                self.register_gfile_for_work(work['id'], dupe[0]['id'], dupe[1])
+                                if dupe[1] > min_drive_conf:
+                                    skip_upload = True
+                            if skip_upload:
+                                pbar.write("  Uploading straight to old versions...")
+                                file_id = gdrive.gcache.upload_file(
+                                    succ,
+                                    folder_id=gdrive.OLD_VERSIONS_FOLDER_ID,
+                                )
+                                if file_id:
+                                    self.register_gfile_for_work(work['id'], file_id, 1)
+                                continue
+                    if existing:
+                        file_id = existing[0]['id']
+                    else:
+                        ret += 1
+                        file_id = importer.import_item(succ, True)
+                    assert file_id is not None, f"Failed to upload {succ}"
+                    self.register_gfile_for_work(work['id'], file_id, 1)
+                else:
+                    self.mark_download(work['id'], False)
+
+        return ret
+
     @locked
     def close(self):
         if self.conn:
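A minimal usage sketch of the new method (hypothetical names: `db` stands for an instance of this class, constructed however the rest of the module does it, and `42` for an existing query id; only `query_id` is required):

    from datetime import timedelta
    from pathlib import Path

    # Try to fetch PDFs for the English-looking works attached to query 42,
    # keep local copies in ./downloads, and retry old failures after 90 days.
    uploaded = db.attempt_downloads_for_query(
        42,
        to_folder=Path("downloads"),
        retry_timedelta=timedelta(days=90),
    )
    print(f"Imported {uploaded} new PDFs")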