-
Notifications
You must be signed in to change notification settings - Fork 16
Expand file tree
/
Copy pathcoredownloader.py
More file actions
76 lines (65 loc) · 2.29 KB
/
coredownloader.py
File metadata and controls
76 lines (65 loc) · 2.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/bin/python3
import journals
from yaspin import yaspin
from local_core import (
CoreAPIWorksCache,
)
import gdrive
import nearestpdf
import website
from tag_predictor import DATA_DIRECTORY
# Warm up the two local indexes before anything else runs: the website's
# content listing and the nearest-PDF lookup table.  yaspin shows a spinner
# while the (presumably slow) website load runs.
with yaspin(text="Loading website..."):
    website.load()
# NOTE(review): nearestpdf.load() sits outside the spinner context — the
# scrape lost indentation, so confirm against the original whether it was
# meant to be covered by the "Loading website..." spinner as well.
nearestpdf.load()
# Every ISSN of every journal we track, flattened into one list.  # 268
TRACKING_ISSNS = [issn for issn_group in journals.issns.values() for issn in issn_group]
# but there's a lot of mess here: other languages, review articles... how to filter?
# "Buddhist" articles that aren't for download
ANTI_TRACKING_KEYWORDS = [
    "\"1556-5068\"",  # SSRN ISSN. 2 # registered as query 3
    "NFTs",  # 166
    "blockchain",  # 869 # registered as 4
    "documentType:review",  # 1040
    "title:review",
]
# The positive clause plus one " AND -<keyword>" exclusion per anti-keyword.
TRACKING_QUERY_STR = (
    "(title:Buddhist OR abstract:Buddhist) AND fullText:Buddhist"
    + "".join(f" AND -{anti}" for anti in ANTI_TRACKING_KEYWORDS)
)
# Open (or create) the on-disk CORE works cache and register the query there.
core = CoreAPIWorksCache(DATA_DIRECTORY.joinpath('core_api.db'))
TRACKING_QUERY = core.register_query(TRACKING_QUERY_STR)
# Map DOI -> Google Drive file id for every PDF the website knows about that
# also appears in nearestpdf's index.  Built here, consumed below to mark
# those works as already held locally.
LOCAL_DOIS: dict[str, str] = {}
for website_item in website.content:
    # Only PDF items are candidates; also guard against an empty formats
    # list, which the original `formats[0]` access would crash on.
    if not website_item.formats or website_item.formats[0] != 'pdf':
        continue
    if not website_item.get("drive_links"):
        continue
    drive_id = gdrive.link_to_id(website_item['drive_links'][0])
    # Skip Drive files that nearestpdf has no index entry for.
    if drive_id not in nearestpdf.gid_to_idx:
        continue
    # Gather every DOI-bearing URL attached to this item; any of several
    # fields may carry one.  The split keeps only the path after 'doi.org/'.
    dois = [
        doi.split('doi.org/')[1]
        for doi in [
            website_item.get('source_url', ''),
            website_item.get('external_url', ''),
            website_item.get('doi', ''),
            website_item.get('alternate_doi', ''),
            website_item.get('alternative_doi', ''),
        ]
        if 'doi.org/' in doi
    ]
    # All of this item's DOIs point at the same Drive file.  (The original
    # also set `core_work = None` here — a dead assignment, never read
    # before being reassigned further down the script; removed.)
    LOCAL_DOIS.update((doi, drive_id) for doi in dois)
print(f"Attempting to fetch {len(LOCAL_DOIS)} works by DOI...")
# Bulk-fetch the CORE Work records for every local DOI in batches of 100.
# NOTE(review): the returned value is discarded below, so this call is
# presumably relied on for its side effect of populating the local cache
# that get_locally_from_doi reads — confirm against CoreAPIWorksCache.
bulk_works = core.bulk_get_by_doi(list(LOCAL_DOIS.keys()), max_per_batch=100)
print(f"Associating {len(bulk_works)} Drive files from the website with their CORE Works...")
# This ensures that we won't try to download anything the website knows about
del bulk_works
# Link each cached CORE Work to the Drive file we already hold for its DOI.
for doi, drive_id in LOCAL_DOIS.items():
    core_work = core.get_locally_from_doi(doi)
    # DOIs CORE doesn't know about simply get skipped.
    if not core_work:
        continue
    # similarity=0.99 marks this as a near-certain match (website-asserted,
    # not computed from PDF content).
    core.register_gfile_for_work(core_work['id'], drive_id, similarity=0.99)
# Drain the tracking query: attempt downloads for everything currently
# loaded, then pull the next page of results, stopping once the query
# yields no further pages.
pages_remaining = True
while pages_remaining:
    core.attempt_downloads_for_query(TRACKING_QUERY)
    pages_remaining = core.load_another_page_from_query(TRACKING_QUERY) > 0