Skip to content
80 changes: 80 additions & 0 deletions _data/drive_folders.json
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,10 @@
"private": "https://drive.google.com/drive/folders/1vcXW9do9cfwjvSuDqeXWTpuvufeL3dGL",
"public": ""
},
"animal-conservation": {
"private": "https://drive.google.com/drive/folders/1PAFwF4lQVsNbg5l4By-omB80itZ8ewlQ",
"public": ""
},
"animalia": {
"private": "https://drive.google.com/drive/folders/1jA5jol1PjVFOanuTTxwlCzF0j64byxHf",
"public": null
Expand Down Expand Up @@ -283,6 +287,10 @@
"private": "https://drive.google.com/drive/folders/1n1tZXEf0S-6cUUeceXFi-337asNsR1JY",
"public": ""
},
"benin": {
"private": "https://drive.google.com/drive/folders/1EAxx45nXFgbMEkCEAo-wTclkHZ65JHuo",
"public": ""
},
"bhikkhuni": {
"private": "https://drive.google.com/drive/folders/1Fh5zkpjAKK9w8xfE_o_Hmw6JRWekbYlu",
"public": "https://drive.google.com/drive/folders/1A6uv_Ld70C_c-53dTYrnnDRy3HCCVlLI"
Expand All @@ -303,6 +311,10 @@
"private": "https://drive.google.com/drive/folders/1b0V7s-k9u6uR-9qhQgD4x8NtElCa2Ik0",
"public": null
},
"birds": {
"private": "https://drive.google.com/drive/folders/13ceLzHA6s8kc3NHcJvcb6aDTb4BGwSbH",
"public": ""
},
"bodhisatta": {
"private": "https://drive.google.com/drive/folders/1GOfwfQn3oi3RCslGBjEQKWmz-uA8nd3o",
"public": ""
Expand Down Expand Up @@ -499,6 +511,10 @@
"private": "https://drive.google.com/drive/folders/1_VpTPpiOFxjt59MOiSHW0R1K4GkfDdUR",
"public": ""
},
"champa": {
"private": "https://drive.google.com/drive/folders/1OLj1dJkCfw5D7pk2ri5lIBkUSeuVcmdL",
"public": ""
},
"chan-lit": {
"private": "https://drive.google.com/drive/folders/10lMrtPr2cS9L3upKbGemKA-P4P7XqKDT",
"public": "https://drive.google.com/drive/folders/18EmB7syXruR9KBEOJJJ7y7mhjJOqsTcT"
Expand Down Expand Up @@ -839,6 +855,10 @@
"private": "https://drive.google.com/drive/folders/13hP6iWUE-AF_J6XZ8Cla-K_ILEmAo6UV",
"public": null
},
"domestic-violence": {
"private": "https://drive.google.com/drive/folders/1em-tbUZsO-On8WHRM3M6k59TsNYhqd9_",
"public": ""
},
"drama": {
"private": "https://drive.google.com/drive/folders/1nK3oSFLTF7SbrvqFol6CQVl5XEc6MRtK",
"public": ""
Expand Down Expand Up @@ -1159,6 +1179,10 @@
"private": "https://drive.google.com/drive/folders/1alayj8MBYPEsh1dUwr3giryT_QdGi1mQ",
"public": ""
},
"grammar": {
"private": "https://drive.google.com/drive/folders/1oWr4endCy9Yv2ZTNMB3vRBKTpJB0sZFB",
"public": ""
},
"graphic-design": {
"private": "https://drive.google.com/drive/folders/1LiBVfkodIczAfczOd1m3bE6FUPSph99y",
"public": ""
Expand Down Expand Up @@ -1207,6 +1231,10 @@
"private": "https://drive.google.com/drive/folders/1BZB6uB21ZvDNbddGHzS-flFvSCwrtvcE",
"public": null
},
"han-dynasty": {
"private": "https://drive.google.com/drive/folders/1cOJlUEllrJ2oNoKg4IhGRDWEK-kED86_",
"public": ""
},
"hawaiian": {
"private": "https://drive.google.com/drive/folders/1QeRUxesBjgcaFWoqNhPShI0su4_XEUDi",
"public": ""
Expand Down Expand Up @@ -1307,6 +1335,10 @@
"private": "https://drive.google.com/drive/folders/1ommnp45TuRWg2ouO0GUUJ_oUS7DC-zch",
"public": null
},
"hunting": {
"private": "https://drive.google.com/drive/folders/1nD3-5ww548AMWoCd1FhPSw2iHwCbKmab",
"public": ""
},
"iberia": {
"private": "https://drive.google.com/drive/folders/1S0EofwKOELxNOPc4hlwAJx1FJY0eiqyI",
"public": null
Expand Down Expand Up @@ -1463,6 +1495,10 @@
"private": "https://drive.google.com/drive/folders/1evst4qF-f3neCbaVDR2flnMP7s-VXE5R",
"public": ""
},
"islam-roots": {
"private": "https://drive.google.com/drive/folders/117vkalh_jjWfiCwUhYnZjJtB0Vv2miqw",
"public": ""
},
"israel": {
"private": "https://drive.google.com/drive/folders/1cFWd79GABWqKdvom184oXUy1lB5_BjzI",
"public": null
Expand Down Expand Up @@ -1495,6 +1531,10 @@
"private": "https://drive.google.com/drive/folders/1OzwQSNg_UkHqDJg2zbiAMNBe6TA7hrzz",
"public": ""
},
"japanese-christianity": {
"private": "https://drive.google.com/drive/folders/1vxmUP46OdmdUXxvbwLiij-8H7SLOCGGW",
"public": ""
},
"japanese-language": {
"private": "https://drive.google.com/drive/folders/1ygkMTpXtA7QAvHxQlxR9O-NhQ-Z8zddS",
"public": null
Expand Down Expand Up @@ -1607,6 +1647,10 @@
"private": "https://drive.google.com/drive/folders/1E1bZp33FntTdbDr9w0aXjLHTvxCgBrtQ",
"public": ""
},
"korea-culture": {
"private": "https://drive.google.com/drive/folders/1blAVLw20HufC3SpNWk6rrTuWXrAGqs09",
"public": ""
},
"korean": {
"private": "https://drive.google.com/drive/folders/1WpIJtgqt2uCebOadGfsBP57j2VNvCchO",
"public": "https://drive.google.com/drive/folders/1OzeiwDNx1UkXWNPAExN-i4R3uFGsjHe0"
Expand All @@ -1619,6 +1663,10 @@
"private": "https://drive.google.com/drive/folders/1enCmmZo18uyscJ7ySQR98fzsVqj7VVIX",
"public": ""
},
"korean-myth": {
"private": "https://drive.google.com/drive/folders/1i8t5mFddJ1uVC6ajxgKVgSUeG9tqWrj8",
"public": ""
},
"korean-roots": {
"private": "https://drive.google.com/drive/folders/19CzNT-c9pKJ_8doMCVfwE0MiVzYa9bcM",
"public": null
Expand Down Expand Up @@ -1735,6 +1783,10 @@
"private": "https://drive.google.com/drive/folders/1jvf1bMKFS5yUvDAfIn0nh1zfJrYFQ0Hc",
"public": ""
},
"macroeconomics": {
"private": "https://drive.google.com/drive/folders/1nAIbo7tuGm-PuDHpce4h2jV6diw4Syoe",
"public": ""
},
"madagascar": {
"private": "https://drive.google.com/drive/folders/1J2gvsgmfNjdW0aOZ1yUFdVFFlKZ_miWR",
"public": null
Expand Down Expand Up @@ -1879,6 +1931,10 @@
"private": "https://drive.google.com/drive/folders/1USrptmxKWj_lnnfTpROvShRXhK_j19FB",
"public": ""
},
"methodism": {
"private": "https://drive.google.com/drive/folders/18-BVYSIsuQkE1Yj2pyTMyM-ffpaPilB9",
"public": ""
},
"metta": {
"private": "https://drive.google.com/drive/folders/183emkjsBw56OLi12nHZW7fX0Etxp_oWN",
"public": ""
Expand Down Expand Up @@ -2071,6 +2127,10 @@
"private": "https://drive.google.com/drive/folders/1FuO7ihpLfOG7S0hJES3uMZe9UKM7XNSv",
"public": "https://drive.google.com/drive/folders/1Df4jtJCGwCjZM7qzpFjWPXINggf4sk2c"
},
"musicology": {
"private": "https://drive.google.com/drive/folders/1TXUrWlPpq_Z8NdEWIqlfe3Oo0JXwzg4l",
"public": ""
},
"mythology": {
"private": "https://drive.google.com/drive/folders/1jHq9v0Qf8RQATVB3mActZhpc3Zn2aMx8",
"public": ""
Expand Down Expand Up @@ -2551,6 +2611,10 @@
"private": "https://drive.google.com/drive/folders/1iuGIL9uhOxwWtid9NnjGFo1FZt6YyqbO",
"public": ""
},
"queensland": {
"private": "https://drive.google.com/drive/folders/1GpegShXPdzEFmliv6yPE87pwDm7wtA9S",
"public": ""
},
"queer-history": {
"private": "https://drive.google.com/drive/folders/1dI-cKvIhqM_4dQDjv8hKWey9w1iVHjsw",
"public": null
Expand Down Expand Up @@ -2675,6 +2739,10 @@
"private": "https://drive.google.com/drive/folders/1S9daHgcxLmecjYaqv7LhLpvrj4CE98be",
"public": ""
},
"sarvastivada": {
"private": "https://drive.google.com/drive/folders/1H14jLITPQsb0n-lai7xQJiTX4oOqkzW7",
"public": ""
},
"sati": {
"private": "https://drive.google.com/drive/folders/1RiZPGADpgW85IiBGbKMqxBUg5NZ0wr3s",
"public": "https://drive.google.com/drive/folders/1D0muqqLzCwOk6Mo0JJZkqA82-eOXWDUX"
Expand Down Expand Up @@ -2775,6 +2843,10 @@
"private": "https://drive.google.com/drive/folders/15crSHNvKJEUT9ZWKYwh_TDACKoNGAxeu",
"public": ""
},
"siberia": {
"private": "https://drive.google.com/drive/folders/1NAHnU1MDK3l1_URvyOkFBdj4Tzyd-lUq",
"public": ""
},
"silicon-valley": {
"private": "https://drive.google.com/drive/folders/1eZ-p05Kv_8RaLXJVG0pZg3nwHfLEfshu",
"public": null
Expand Down Expand Up @@ -3143,6 +3215,10 @@
"private": "https://drive.google.com/drive/folders/1K7z23KUqFD5hB-awjYIR2fUNX2gb-lGy",
"public": null
},
"the-bible": {
"private": "https://drive.google.com/drive/folders/112OZqzSkA_7xVUbNxox4JK5eS1BzsGmA",
"public": ""
},
"the-west": {
"private": "https://drive.google.com/drive/folders/15IiTFDxpoOrn9PydeJEI1niFbVTkZQP8",
"public": null
Expand Down Expand Up @@ -3179,6 +3255,10 @@
"private": "https://drive.google.com/drive/folders/1emgR7cTrCFISHFeAJb5eO21HsKp1Ma7O",
"public": null
},
"tibet-culture": {
"private": "https://drive.google.com/drive/folders/1BopUdVns1si2Qo9dh_KLvSxQtlOv3ywx",
"public": ""
},
"tibetan": {
"private": "https://drive.google.com/drive/folders/1DfpBxiTHKen3ZoQ2_Eg6gxxxW558Im0U",
"public": "https://drive.google.com/drive/folders/1nxdlT8-xZ5a0b63WxjTCPxwlDnXDSiLL"
Expand Down
10 changes: 5 additions & 5 deletions scripts/android_go_through.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def load_normalized_text_for_file(fp: Path, google_id: str) -> str:
from tqdm import tqdm
print(f"# Removing local duplicates...")
from collections import defaultdict
pbar = tqdm(local_files, unit="file")
pbar = tqdm(local_files, unit="f")
size_to_local_names = defaultdict(set)
for fp in pbar:
size_to_local_names[fp.stat().st_size].add(fp.name)
Expand Down Expand Up @@ -189,9 +189,9 @@ def process_local_file(fp: Path):
# fp.unlink()
# For now just move it out to be on the safe side...
fp.rename(fp.parent.joinpath('../../Download/').joinpath(fp.name))
tqdm_thread_map(process_local_file, local_files, max_workers=8, unit="file")
tqdm_thread_map(process_local_file, local_files, max_workers=8, unit="f")
print(f"# Ensuring all remote files are downloaded locally...")
children = tqdm(remote_children, unit="file")
children = tqdm(remote_children, unit="f")
for child in children:
if child['id'] in remote_ids_seen:
continue
Expand Down Expand Up @@ -227,7 +227,7 @@ def extract_text_from(fp):
if NORMALIZED_TEXT_FOLDER.joinpath(gid+'.pkl').exists():
return
load_normalized_text_for_file(fp, gid)
tqdm_thread_map(extract_text_from, local_files, max_workers=4, unit="file")
tqdm_thread_map(extract_text_from, local_files, max_workers=4, unit="f")
del remote_files_by_name
print("# Sorting PDFs into bulk import folders...")
children = gdrive.gcache.sql_query(
Expand All @@ -254,7 +254,7 @@ def sort_pdf_file(child):
[REMOTE_FOLDER],
verbose=False,
)
tqdm_thread_map(sort_pdf_file, children, max_workers=8, unit="file")
tqdm_thread_map(sort_pdf_file, children, max_workers=8, unit="f")
print("Done setting up local folder! Run again without --init to review files")
exit()

Expand Down
4 changes: 2 additions & 2 deletions scripts/clean_google_drive.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ def remove_duplicate_files(verbose=True):
duplicate_md5s = gcache.find_duplicate_md5s()
print(f"[duplicates] Found {len(duplicate_md5s)} duplicated files by hash.")
if not verbose:
duplicate_md5s = tqdm(duplicate_md5s, unit='file', desc='Handling duplicates')
duplicate_md5s = tqdm(duplicate_md5s, unit='f', desc='Handling duplicates')
for md5 in duplicate_md5s:
remove_duplicate_file(md5, verbose=verbose, dry_run=False)
duplicate_urls = gcache.find_duplicate_urls()
Expand Down Expand Up @@ -376,7 +376,7 @@ def _get_target(pickle_file):
return gcache.get_item(pickle_file['name'][0:-4])
print("Checking pickle files...")
deletes = 0
pbar = tqdm(range(len(all_pickles)), unit="file")
pbar = tqdm(range(len(all_pickles)), unit="f")
for idx in pbar:
pickle = all_pickles[idx]
target = _get_target(pickle)
Expand Down
86 changes: 86 additions & 0 deletions scripts/coredownloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/bin/python3

import json
import journals
from yaspin import yaspin
from tqdm import tqdm
from local_core import (
CoreAPIWorksCache,
)
import gdrive
import nearestpdf
import titlematch
import website
# Run website.load() while a terminal spinner labelled "Loading website..."
# is displayed.
loading_spinner = yaspin(text="Loading website...")
with loading_spinner:
    website.load()

# NOTE(review): presumably this is what populates nearestpdf.gid_to_idx and
# nearestpdf.google_files, both of which are read further down -- confirm.
nearestpdf.load()

# Flatten every ISSN group found in journals.issns into a single list.
# NOTE(review): the trailing numbers in this section look like result counts
# noted down by hand -- confirm before relying on them.
TRACKING_ISSNS = [
    issn
    for issn_group in journals.issns.values()
    for issn in issn_group
]  # 268
# but there's a lot of mess here: other languages, review articles... how to filter?

# "Buddhist" articles that aren't for download
MDPI_PROVIDER_ID = 22080  # 351
ANTI_TRACKING_KEYWORDS = [
    "1556-5068",  # SSRN ISSN. 2 # registered as query 3
    "NFTs",  # 166
    "blockchain",  # 869 # registered as 4
    "documentType:review",  # 1040
]

# Handle on the CORE works cache used for the DOI lookups below.
# NOTE(review): the path is hard-coded to one machine's desktop.
core = CoreAPIWorksCache('/home/khbh/Desktop/core_api.db')

# Result accumulators. Nothing in the visible script appends to them yet.
self_similarities, self_plus_similarities = [], []
differences, title_similarities = [], []

# For every website item that (a) lists PDF as its first format, (b) has a
# Drive link whose id is in nearestpdf.gid_to_idx, and (c) has a locally cached
# CORE record with full text reachable through one of its DOI links, ask
# nearestpdf.find_matching_files to locate the file from the CORE metadata and
# check whether the single result is that same Drive file.

# Metadata keys that may carry a "doi.org/" link, in lookup order.
_DOI_FIELDS = (
    'source_url',
    'external_url',
    'doi',
    'alternate_doi',
    'alternative_doi',
)

for website_item in website.content:
    if website_item.formats[0] != 'pdf':
        continue
    if not website_item.get("drive_links"):
        continue
    drive_id = gdrive.link_to_id(website_item['drive_links'][0])
    if drive_id not in nearestpdf.gid_to_idx:
        continue

    field_values = (website_item.get(field, '') for field in _DOI_FIELDS)
    dois = [
        value.split('doi.org/')[1]
        for value in field_values
        # .get() only falls back to '' when the key is missing; a key that is
        # present with a None value must be skipped rather than raise TypeError.
        if value and 'doi.org/' in value
    ]

    # Use the first DOI for which the local cache returns a record.
    core_work = None
    for doi in dois:
        core_work = core.get_locally_from_doi(doi)
        if core_work:
            break
    if not core_work:
        continue
    if not core_work['full_text']:
        continue

    text_plus = f"{core_work['full_text']} {core_work['title']} {core_work['abstract'] or ''}"

    # Author parsing is best-effort: on malformed or missing data, fall back to
    # the empty-string default. Only data errors are caught so that Ctrl-C and
    # SystemExit still propagate.
    authors = ''
    try:
        authors = [auth['name'] for auth in json.loads(core_work['authors'])]
    except (TypeError, ValueError, KeyError, IndexError):
        pass

    matches = nearestpdf.find_matching_files(
        core_work['title'],
        authors,
        text_plus,
    )
    # Not used below; presumably kept in scope for inspection from the debugger.
    drive_file = nearestpdf.google_files[nearestpdf.gid_to_idx[drive_id]]
    if len(matches) == 1 and matches[0][0]['id'] == drive_id:
        print("Got it with confidence", matches[0][1])
    else:
        # Deliberate breakpoint: stop for manual inspection whenever the
        # matcher does not return exactly the expected file.
        import ipdb
        ipdb.set_trace()





Loading