-
Notifications
You must be signed in to change notification settings - Fork 16
Expand file tree
/
Copy pathcoredownloader.py
More file actions
76 lines (65 loc) · 2.29 KB
/
coredownloader.py
File metadata and controls
76 lines (65 loc) · 2.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/bin/python3
import journals
from yaspin import yaspin
from local_core import (
CoreAPIWorksCache,
)
import gdrive
import nearestpdf
import website
from tag_predictor import DATA_DIRECTORY
# Warm up the two local indexes before anything else runs: the website's
# content listing and the nearest-PDF lookup table.  yaspin shows a spinner
# while the (presumably slow) website load runs.
with yaspin(text="Loading website..."):
    website.load()
# NOTE(review): nearestpdf.load() sits outside the spinner context — the
# scrape lost indentation, so confirm against the original whether it was
# meant to be covered by the "Loading website..." spinner as well.
nearestpdf.load()
# Every ISSN of every journal we track, flattened into one list.  # 268
TRACKING_ISSNS = [issn for issn_group in journals.issns.values() for issn in issn_group]
# but there's a lot of mess here: other languages, review articles... how to filter?
# "Buddhist" articles that aren't for download
ANTI_TRACKING_KEYWORDS = [
    "\"1556-5068\"",  # SSRN ISSN. 2 # registered as query 3
    "NFTs",  # 166
    "blockchain",  # 869 # registered as 4
    "documentType:review",  # 1040
    "title:review",
]
# The positive clause plus one " AND -<keyword>" exclusion per anti-keyword.
TRACKING_QUERY_STR = (
    "(title:Buddhist OR abstract:Buddhist) AND fullText:Buddhist"
    + "".join(f" AND -{anti}" for anti in ANTI_TRACKING_KEYWORDS)
)
# Open (or create) the on-disk CORE works cache and register the query there.
core = CoreAPIWorksCache(DATA_DIRECTORY.joinpath('core_api.db'))
TRACKING_QUERY = core.register_query(TRACKING_QUERY_STR)
# Map DOI -> Google Drive file id for every PDF the website knows about that
# also appears in nearestpdf's index.  Built here, consumed below to mark
# those works as already held locally.
LOCAL_DOIS: dict[str, str] = {}
for website_item in website.content:
    # Only PDF items are candidates; also guard against an empty formats
    # list, which the original `formats[0]` access would crash on.
    if not website_item.formats or website_item.formats[0] != 'pdf':
        continue
    if not website_item.get("drive_links"):
        continue
    drive_id = gdrive.link_to_id(website_item['drive_links'][0])
    # Skip Drive files that nearestpdf has no index entry for.
    if drive_id not in nearestpdf.gid_to_idx:
        continue
    # Gather every DOI-bearing URL attached to this item; any of several
    # fields may carry one.  The split keeps only the path after 'doi.org/'.
    dois = [
        doi.split('doi.org/')[1]
        for doi in [
            website_item.get('source_url', ''),
            website_item.get('external_url', ''),
            website_item.get('doi', ''),
            website_item.get('alternate_doi', ''),
            website_item.get('alternative_doi', ''),
        ]
        if 'doi.org/' in doi
    ]
    # All of this item's DOIs point at the same Drive file.  (The original
    # also set `core_work = None` here — a dead assignment, never read
    # before being reassigned further down the script; removed.)
    LOCAL_DOIS.update((doi, drive_id) for doi in dois)
print(f"Attempting to fetch {len(LOCAL_DOIS)} works by DOI...")
# Bulk-fetch the CORE Work records for every local DOI in batches of 100.
# NOTE(review): the returned value is discarded below, so this call is
# presumably relied on for its side effect of populating the local cache
# that get_locally_from_doi reads — confirm against CoreAPIWorksCache.
bulk_works = core.bulk_get_by_doi(list(LOCAL_DOIS.keys()), max_per_batch=100)
print(f"Associating {len(bulk_works)} Drive files from the website with their CORE Works...")
# This ensures that we won't try to download anything the website knows about
del bulk_works
# Link each cached CORE Work to the Drive file we already hold for its DOI.
for doi, drive_id in LOCAL_DOIS.items():
    core_work = core.get_locally_from_doi(doi)
    # DOIs CORE doesn't know about simply get skipped.
    if not core_work:
        continue
    # similarity=0.99 marks this as a near-certain match (website-asserted,
    # not computed from PDF content).
    core.register_gfile_for_work(core_work['id'], drive_id, similarity=0.99)
# Drain the tracking query: attempt downloads for everything currently
# loaded, then pull the next page of results, stopping once the query
# yields no further pages.
pages_remaining = True
while pages_remaining:
    core.attempt_downloads_for_query(TRACKING_QUERY)
    pages_remaining = core.load_another_page_from_query(TRACKING_QUERY) > 0