Skip to content

Commit 1ba79ff

Browse files
committed
Finalize the nearestpdf finder
[skip ci]
1 parent 4b27060 commit 1ba79ff

4 files changed

Lines changed: 217 additions & 50 deletions

File tree

‎scripts/coredownloader.py‎

Lines changed: 66 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,86 @@
11
#!/bin/python3
"""Match website PDF items against CORE works via nearestpdf.

For every website content item that is a PDF with a Google Drive link,
resolve its DOI(s) against the local CORE cache, then ask nearestpdf for
the best-matching Drive file.  Confident single matches are printed; any
mismatch drops into a debugger for manual inspection.
"""

import json

from tqdm import tqdm  # noqa: F401 -- kept for ad-hoc progress bars
from yaspin import yaspin

import gdrive
import journals
import nearestpdf
import titlematch  # noqa: F401 -- kept for interactive use in the debugger
import website
from local_core import (
    CoreAPIWorksCache,
)

with yaspin(text="Loading website..."):
    website.load()

nearestpdf.load()


TRACKING_ISSNS = [issn for val in journals.issns.values() for issn in val]  # 268
# but there's a lot of mess here: other languages, review articles... how to filter?

# "Buddhist" articles that aren't for download
MDPI_PROVIDER_ID = 22080  # 351
ANTI_TRACKING_KEYWORDS = [
    "1556-5068",  # SSRN ISSN. 2 # registered as query 3
    "NFTs",  # 166
    "blockchain",  # 869 # registered as 4
    "documentType:review",  # 1040
]

# NOTE(review): hard-coded local path -- consider an env var or CLI argument.
core = CoreAPIWorksCache('/home/khbh/Desktop/core_api.db')

# Accumulators kept for interactive inspection from the breakpoint below.
self_similarities = []
self_plus_similarities = []
differences = []
title_similarities = []

for website_item in website.content:
    # Only consider PDF items that carry a Drive link known to nearestpdf.
    # Guard against an empty formats list (original indexed [0] unguarded).
    if not website_item.formats or website_item.formats[0] != 'pdf':
        continue
    if not website_item.get("drive_links"):
        continue
    drive_id = gdrive.link_to_id(website_item['drive_links'][0])
    if drive_id not in nearestpdf.gid_to_idx:
        continue
    # Collect every DOI-shaped URL on the item, stripped to the bare DOI.
    dois = [
        doi.split('doi.org/')[1]
        for doi in [
            website_item.get('source_url', ''),
            website_item.get('external_url', ''),
            website_item.get('doi', ''),
            website_item.get('alternate_doi', ''),
            website_item.get('alternative_doi', ''),
        ]
        if 'doi.org/' in doi
    ]
    # First DOI that resolves in the local CORE cache wins.
    core_work = None
    for doi in dois:
        core_work = core.get_locally_from_doi(doi)
        if core_work:
            break
    if not core_work:
        continue
    if not core_work['full_text']:
        continue
    text_plus = f"{core_work['full_text']} {core_work['title']} {core_work['abstract'] or ''}"
    # Author metadata is best-effort: missing/malformed JSON means no authors.
    # Default is a list (the original used '' -- a string, which would be
    # iterated per-character downstream); catch only the errors json.loads
    # and the key lookup can raise instead of a bare except.
    authors = []
    try:
        authors = [auth['name'] for auth in json.loads(core_work['authors'])]
    except (TypeError, ValueError, KeyError):
        pass
    matches = nearestpdf.find_matching_files(
        core_work['title'],
        authors,
        text_plus,
    )
    # Looked up so it is available in the debugger session below.
    drive_file = nearestpdf.google_files[nearestpdf.gid_to_idx[drive_id]]
    if len(matches) == 1 and matches[0][0]['id'] == drive_id:
        print("Got it with confidence", matches[0][1])
    else:
        # Deliberate breakpoint: inspect mismatches by hand.
        import ipdb; ipdb.set_trace()
3784

3885

3986

‎scripts/local_core.py‎

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -400,8 +400,8 @@ def load_another_page_from_query(self, query_id: int) -> int:
400400
return ret
401401

402402
@locked
403-
def _get_local_by_doi(self, doi: str) -> dict | None:
404-
"""Helper to find the work with the given DOI in the local database"""
403+
def get_locally_from_doi(self, doi: str) -> dict | None:
404+
"""If we have the work for this doi locally already, returns it, else None"""
405405
self.cursor.execute(
406406
"SELECT works.* FROM works JOIN identifiers ON works.id = identifiers.work_id WHERE identifiers.id = ? AND identifiers.id_type = 'DOI'",
407407
(doi, )
@@ -427,7 +427,6 @@ def _get_local_by_doi(self, doi: str) -> dict | None:
427427
# use a combination of citations and english score to pick one
428428
# Using lower ids as the tie breaker
429429
if len(ret) > 1:
430-
print(f"WARNING: Found {len(ret)} works for DOI:{doi}")
431430
ret.sort(key=lambda r: r['id'])
432431
ret.sort(
433432
key=lambda r: 2*(r['en_confidence'] or 0)+(r['citation_count'] or 0),
@@ -457,7 +456,7 @@ def bulk_get_by_doi(self, dois: list[str], max_per_batch: int = 0, verbose: bool
457456
now = current_timestamp()
458457

459458
for i, doi in enumerate(dois):
460-
work = self._get_local_by_doi(doi)
459+
work = self.get_locally_from_doi(doi)
461460
if work:
462461
results[i] = work
463462
if verbose:
@@ -504,7 +503,7 @@ def bulk_get_by_doi(self, dois: list[str], max_per_batch: int = 0, verbose: bool
504503

505504
# For each DOI in our batch, check if we found it (via the DB lookup)
506505
for doi in batch:
507-
work = self._get_local_by_doi(doi)
506+
work = self.get_locally_from_doi(doi)
508507
if work:
509508
for idx in to_fetch[doi]:
510509
results[idx] = work
@@ -524,6 +523,14 @@ def bulk_get_by_doi(self, dois: list[str], max_per_batch: int = 0, verbose: bool
524523

525524
return results
526525

526+
@locked
def get_local_works_for_query(self, query_id: int) -> list[dict]:
    """Return every work cached locally for the query *query_id*.

    Joins ``works`` against the ``query_works`` link table and returns
    each matching row as a plain dict.  An unknown query_id yields an
    empty list.
    (Assumes the connection's row_factory yields mapping-style rows,
    e.g. sqlite3.Row -- confirm against the cache's __init__.)
    """
    self.cursor.execute(
        "SELECT works.* FROM works JOIN query_works ON query_works.work_id = works.id WHERE query_works.query_id = ?",
        (query_id, )
    )
    return [dict(row) for row in self.cursor.fetchall()]
533+
527534
@locked
528535
def close(self):
529536
if self.conn:

0 commit comments

Comments
 (0)