Skip to content

Commit e7cff0f

Browse files
committed
fix this month's links
1 parent 4c4bc01 commit e7cff0f

7 files changed

Lines changed: 187 additions & 30 deletions

‎_content/articles/diversity-statements_carnes-fine-sheridan.md‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ authors:
55
- "Eve Fine"
66
- "Jennifer Sheridan"
77
external_url: "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6309930/"
8-
source_url: "https://journals.lww.com/academicmedicine/Fulltext/2019/01000/Promises_and_Pitfalls_of_Diversity_Statements_.13.aspx"
8+
source_url: "https://academic.oup.com/academicmedicine/article-pdf/94/1/20/65953397/20190100.0-00013.pdf"
99
drive_links:
1010
- "https://drive.google.com/file/d/1daNVmePy1PrvgBMtoicBlNwhmngJriKb/view?usp=drivesdk"
1111
file_links:

‎_content/articles/sumedhakatha-in-pali-and-the-northern-tradition_matsumura-junko.md‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
title: "The *Sumedhakathā* in Pāli Literature and Its Relation to the Northern Buddhist Textual Tradition"
33
authors:
44
- "Junko Matsumura"
5-
external_url: "https://core.ac.uk/download/pdf/290124321.pdf"
5+
external_url: "https://icabs.repo.nii.ac.jp/?action=repository_action_common_download&item_id=42&item_no=1&attribute_id=22&file_no=1"
66
drive_links:
77
- "https://drive.google.com/file/d/1wlajSSbBDkBPbIxJYlwGWzgE8wHlf_4T/view?usp=drivesdk"
88
course: buddha

‎_content/av/bowie-jazz-piano_harford-tim.md‎

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ subcat: podcast
66
editor: "Julia Barton"
77
external_url: "https://www.pushkin.fm/podcasts/cautionary-tales/bowie-jazz-and-the-unplayable-piano"
88
source_url: "https://timharford.com/2019/12/cautionary-tales-ep-7-bowie-jazz-and-the-unplayable-piano/"
9-
alternate_url: "https://soundcloud.com/blinkist/cautionary-tales-bowie-jazz-and-the-unplayable-piano"
109
course: world
1110
tags:
1211
- music

‎scripts/bulk_import.py‎

Lines changed: 33 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from collections import defaultdict
77
import argparse
88
import re
9+
import enum
910
import threading
1011
import requests
1112
import joblib
@@ -31,7 +32,8 @@
3132
LINK_SAVER = "LibraryUtils.LinkSaver"
3233
PDF_SAVER = "LibraryUtils.BulkPDFImporter"
3334

34-
course_predictor = None
35+
with yaspin(text="Loading tag predictor..."):
36+
course_predictor = TagPredictor.load()
3537
disk_memorizor = joblib.Memory(gdrive.gcache_folder, verbose=0)
3638

3739
def synchronized(func):
@@ -133,16 +135,23 @@ def get_folder_id_for_course(self, course:str) -> str:
133135
self.unread_folderid_for_course[course] = subfolder
134136
return subfolder
135137

138+
class BulkPDFType(enum.StrEnum):
    """Closed set of bulk-PDF import types.

    The string values double as the accepted `--pdf-type` command-line
    choices, and BulkPDFImporter.__init__ maps each member to a folder name.
    """
    # NOTE(review): BulkPDFImporter.__init__ says gdrive.select_ids_to_keep
    # must be kept in sync with these types -- confirm when adding a member.
    ACADEMIA_EDU = 'academia.edu'
    TO_GO_THROUGH = 'togothrough'
    CORE_API = 'coreapi'
142+
136143
class BulkPDFImporter(BulkItemImporter):
137-
def __init__(self, pdf_type) -> None:
144+
def __init__(self, pdf_type: BulkPDFType) -> None:
138145
super().__init__()
139146
self.pdf_type = pdf_type
140147
match pdf_type:
141148
# Make sure to update gdrive.select_ids_to_keep as well
142-
case 'academia.edu':
149+
case BulkPDFType.ACADEMIA_EDU:
143150
self.folder_name = "🏛️ Academia.edu"
144-
case 'togothrough':
151+
case BulkPDFType.TO_GO_THROUGH:
145152
self.folder_name = "📥 To Go Through"
153+
case BulkPDFType.CORE_API:
154+
self.folder_name = "🔓 CORE API"
146155
case _:
147156
raise ValueError("Invalid PDF type: "+pdf_type)
148157

@@ -153,9 +162,25 @@ def can_import_item(self, item: str) -> bool:
153162
return item.lower().endswith('.pdf') \
154163
and Path(item).is_file() # so far, only support local files
155164

156-
def import_items(self, items: list[str]):
165+
def import_item(self, item: Path, verbose: bool) -> str | None:
    """Classify one local PDF and upload it to that course's folder.

    Args:
        item: Path of the PDF to read and upload.
        verbose: Passed straight through to the Drive upload helper.

    Returns:
        Whatever `gdrive_base.upload_to_google_drive` returned; a falsy
        value means nothing was uploaded and no normalized text was saved.
    """
    body_text = normalize_text(readpdf(item, normalize=0))
    # The file stem is appended three times to the text fed to the
    # predictor (presumably to give the filename extra weight).
    stem_text = normalize_text((' '+item.stem) * 3)
    predictions = course_predictor.predict([body_text+stem_text], normalized=True)
    course = predictions[0]
    upload_result = gdrive_base.upload_to_google_drive(
        item,
        folder_id=self.get_folder_id_for_course(course),
        filename=item.name,
        creator=PDF_SAVER,
        verbose=verbose,
    )
    if not upload_result:
        return upload_result
    # Only persist the extracted text once the upload has gone through.
    save_normalized_text(upload_result, body_text)
    return upload_result
180+
181+
def import_items(self, items: list[str | Path]):
157182
files = [Path(item) for item in items]
158-
if self.pdf_type == "academia.edu":
183+
if self.pdf_type == BulkPDFType.ACADEMIA_EDU:
159184
"""Academia.edu PDFs use _s instead of spaces
160185
Replace them with spaces for my sanity"""
161186
for fp in list(files):
@@ -171,20 +196,9 @@ def import_items(self, items: list[str]):
171196
tqdm.write(f"Skipping {fp} as that file is already on Drive!")
172197
fp.unlink()
173198
continue
174-
text = normalize_text(readpdf(fp, normalize=0))
175-
name = normalize_text((' '+fp.stem) * 3)
176-
course = course_predictor.predict([text+name], normalized=True)[0]
177-
folder = self.get_folder_id_for_course(course)
178-
uploaded = gdrive_base.upload_to_google_drive(
179-
fp,
180-
folder_id=folder,
181-
filename=fp.name,
182-
creator=PDF_SAVER,
183-
verbose=False,
184-
)
199+
uploaded = self.import_item(item, False)
185200
if uploaded:
186201
fp.unlink()
187-
save_normalized_text(uploaded, text)
188202
else:
189203
tqdm.write(f"Failed to upload {fp}!")
190204

@@ -747,7 +761,7 @@ def resort_existing_pdfs_of_type(pdf_type: str):
747761
'--pdf-type',
748762
dest="pdf_type",
749763
nargs="?",
750-
choices=['academia.edu', 'togothrough'],
764+
choices=[str(v) for v in BulkPDFType],
751765
help="Which subfolder to sort PDFs into (required if importing PDFs)",
752766
)
753767
argparser.add_argument(
@@ -760,8 +774,6 @@ def resort_existing_pdfs_of_type(pdf_type: str):
760774
If no pdf-type is, then it'll resort the link docs.""",
761775
)
762776
args = argparser.parse_args()
763-
with yaspin(text="Loading tag predictor..."):
764-
course_predictor = TagPredictor.load()
765777
if args.resort:
766778
if args.pdf_type:
767779
resort_existing_pdfs_of_type(args.pdf_type)

‎scripts/coredownloader.py‎

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,8 @@
7272
continue
7373
core.register_gfile_for_work(core_work['id'], drive_id, similarity=0.99)
7474

75-
while core.load_another_page_from_query(TRACKING_QUERY) > 0:
76-
print("Loading another page...")
75+
while True:
76+
core.attempt_downloads_for_query(TRACKING_QUERY)
77+
if core.load_another_page_from_query(TRACKING_QUERY) <= 0:
78+
break
7779

‎scripts/downloadutils.py‎

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -187,9 +187,17 @@ def download(url: str, filename: str, expected_type=None) -> bool:
187187
r.close()
188188
# some servers dislike streaming/sniffing and prefer you dl in one go
189189
r = requests.get(url, headers=REQUEST_HEADERS, timeout=30)
190+
if r.content.startswith(firstchunk):
191+
spinner.text = "Got it!"
192+
spinner.ok("( ^.^ )")
193+
else:
194+
spinner.text = "Got a different file the second time around"
195+
spinner.fail("( T.T )")
196+
if expected_type == 'pdf' and r.content.startswith(b"%PDF-"):
197+
spinner.text = "Got a different file, but it might still be ok..."
198+
else:
199+
return False
190200
fd.write(r.content)
191-
spinner.text = "Got it!"
192-
spinner.ok("( ^.^ )")
193201
print(f"{len(r.content)} bytes this time")
194202
return True
195203
except Exception as e:

‎scripts/local_core.py‎

Lines changed: 138 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,19 @@
22

33
import requests
44
import sqlite3
5+
import tempfile
56
import json
67
import re
7-
from datetime import datetime, timezone
8+
from datetime import datetime, timezone, timedelta
89
from pathlib import Path
910
import threading
1011
from time import sleep
1112
from enum import IntEnum
1213
from language_detection import LANGUAGE_DETECTOR, Language
13-
from strutils import author_name_to_normal
14+
from strutils import author_name_to_normal, md5
1415
import nearestpdf
16+
from tqdm import tqdm
17+
from downloadutils import download, pdf_name_for_work
1518

1619
# Maybe a better place to put this mutual dependency?
1720
from local_gdrive import locked
@@ -604,6 +607,139 @@ def match_gfiles_to_local_works(self):
604607
found += 1
605608
print(f"Found Google Drive files for {found} works and added them to the DB")
606609

610+
@locked
611+
def mark_download(self, work_id: str, success: bool, timestamp: int=None):
    """Record the outcome of a download attempt on a work's row.

    The sign of the stored `downloaded_date` encodes the outcome: a
    positive timestamp means success, the negated timestamp means failure.

    Args:
        work_id: Value matched against `works.id`.
        success: Whether the download attempt succeeded.
        timestamp: Time of the attempt; a falsy value (None or 0) is
            replaced with `current_timestamp()`.
    """
    stamp = timestamp or current_timestamp()
    signed_stamp = stamp if success else -stamp
    self.cursor.execute("""
        UPDATE works SET downloaded_date = ? WHERE id = ?
        """,
        (signed_stamp, work_id,)
    )
    self.conn.commit()
622+
623+
def _attempt_to_download(self, work: dict | sqlite3.Row, to_folder: Path) -> Path | None:
    """Try every known URL for a work until one yields a PDF.

    URLs are tried in this order, stopping at the first success:
      1. the work's stored SOURCE_URL identifier, if any;
      2. each entry of the CORE output record's `urls`;
      3. the CORE output record's `downloadUrl`;
      4. the work's own `download_url`, if it differs from (3).
    Steps 2-4 only happen when the work has a `download_url`.

    Args:
        work: The work's row or dict; must provide `id` and may provide
            `download_url`.
        to_folder: Directory the PDF is written into.

    Returns:
        The path of the downloaded file, or None if every attempt failed.
    """
    work = dict(work)
    outpath = to_folder.joinpath(pdf_name_for_work(work))
    with self._lock:
        source = self.cursor.execute("""
            SELECT id FROM identifiers WHERE work_id = ? AND id_type = 'SOURCE_URL' LIMIT 1
            """,
            (work['id'], )
        ).fetchone()
    # A work may have no SOURCE_URL row at all; remember that as None so the
    # de-duplication below never subscripts a missing row.
    source_url = source['id'] if source else None
    if source:
        if download(source_url, outpath, expected_type='pdf'):
            return outpath

    download_url = work.get('download_url')
    if not download_url:
        return None

    # Only URLs of the canonical CORE form carry an output id we can look up.
    # Anything else skips the API call and is tried directly below.
    matched = re.fullmatch(
        r'https://core\.ac\.uk/download/(?:pdf/)?([0-9]+)\.pdf',
        download_url,
    )
    output = call_api(f"outputs/{matched.group(1)}", {}) if matched else {}

    # Skip the source URL here: it was already tried (and failed) above.
    candidates = [url for url in (output.get('urls') or []) if url != source_url]
    if output.get('downloadUrl'):
        candidates.append(output['downloadUrl'])
    if download_url != output.get('downloadUrl'):
        candidates.append(download_url)

    for url in candidates:
        if download(url, outpath, expected_type='pdf'):
            return outpath
    return None
658+
659+
def attempt_downloads_for_query(self, query_id: int, to_folder: Path | None=None, min_en_conf: float=0.8, min_drive_conf: float=0.6, retry_timedelta: int | timedelta=15811200000) -> int:
    """Download the not-yet-held works of a query and file them on Drive.

    Args:
        query_id: The query whose locally stored works should be fetched.
        to_folder: If you'd like to keep the downloaded files, supply a
            folder. Otherwise they go into a temporary directory and are
            not kept.
        min_en_conf: Works whose `en_confidence` is below this are skipped.
        min_drive_conf: Match confidence above which a work counts as
            already being on Drive.
        retry_timedelta: How long to wait before retrying a failed
            download, as a timedelta or as milliseconds (the default is
            183 days).

    Returns:
        The number of files handed to the importer as new uploads.

    Raises:
        AssertionError: If the importer fails to upload a new file.
    """
    works = self.get_local_works_for_query(query_id)
    # Filter out non-English works. Each work is copied into a plain dict
    # because sqlite3.Row (which _attempt_to_download accepts) has no .get().
    works = [dict(work) for work in works if work['en_confidence'] >= min_en_conf]
    # Filter out works that we downloaded successfully or tried recently
    if isinstance(retry_timedelta, timedelta):
        retry_timedelta = int(retry_timedelta.total_seconds() * 1000)
    # Failures are stored as negated timestamps, so "failed before the retry
    # cutoff" means "greater than the negated cutoff".
    since = -(current_timestamp() - retry_timedelta)
    works = [work for work in works if work.get('downloaded_date') is None or (work['downloaded_date'] <= 0 and work['downloaded_date'] > since)]
    # Filter out works we already have on Drive
    with self._lock:
        works = [
            work for work in works if
            self.cursor.execute(
                "SELECT * FROM work_gfiles WHERE work_id = ? AND pval > ? LIMIT 1",
                (work['id'], min_drive_conf, )
            ).fetchone() is None
        ]
    print(f"Attempting to download {len(works)} works from query {query_id}...")
    pbar = tqdm(works)
    ret = 0
    # Imported here rather than at module level, as in the original.
    import pypdf.errors
    from pdfutils import readpdf
    from bulk_import import BulkPDFImporter, BulkPDFType
    import gdrive
    import nearestpdf
    nearestpdf.load()
    importer = BulkPDFImporter(BulkPDFType.CORE_API)
    with tempfile.TemporaryDirectory() as temp_dir:
        if not to_folder:
            to_folder = Path(temp_dir)
        for work in pbar:
            succ = self._attempt_to_download(work, to_folder)
            if not succ:
                self.mark_download(work['id'], False)
                continue
            self.mark_download(work['id'], True)
            file_md5 = md5(succ)
            # Look for an exact copy first, including among trashed items.
            existing = gdrive.gcache.get_items_with_md5(file_md5)
            if not existing:
                existing = gdrive.gcache.get_trashed_items_with_md5(file_md5)
            if not existing:
                authors = work['authors']
                if isinstance(authors, str):
                    authors = json.loads(authors)
                assert isinstance(authors, list)
                authors = [author_name_to_normal(author['name']) for author in authors]
                try:
                    fuzzy_dupes = nearestpdf.find_matching_files(work['title'], authors, readpdf(succ))
                except (pypdf.errors.PdfReadError, pypdf.errors.PdfStreamError):
                    # The file could not be parsed, so undo the success mark.
                    pbar.write("Didn't get a valid PDF :(")
                    self.mark_download(work['id'], False)
                    continue
                if fuzzy_dupes:
                    pbar.write(f"Found a fuzzy duplicate for \"{succ}\" on GDrive: \"{fuzzy_dupes[0][0]['name']}\"")
                    skip_upload = False
                    # NOTE(review): each dupe looks like a (gfile, similarity)
                    # pair -- confirm against nearestpdf.find_matching_files.
                    for dupe in fuzzy_dupes:
                        self.register_gfile_for_work(work['id'], dupe[0]['id'], dupe[1])
                        if dupe[1] > min_drive_conf:
                            skip_upload = True
                    if skip_upload:
                        pbar.write(" Uploading straight to old versions...")
                        file_id = gdrive.gcache.upload_file(
                            succ,
                            folder_id=gdrive.OLD_VERSIONS_FOLDER_ID,
                        )
                        if file_id:
                            self.register_gfile_for_work(work['id'], file_id, 1)
                        continue
            if existing:
                file_id = existing[0]['id']
            else:
                ret += 1
                file_id = importer.import_item(succ, True)
                # Raised explicitly so the check survives `python -O`.
                if file_id is None:
                    raise AssertionError(f"Failed to upload {succ}")
            # NOTE(review): nesting was reconstructed from flattened text; this
            # is assumed to run for both the existing and new-upload branches.
            self.register_gfile_for_work(work['id'], file_id, 1)

    return ret
742+
607743
@locked
608744
def close(self):
609745
if self.conn:

0 commit comments

Comments
 (0)