|
1 | 1 | #!/bin/python3 |
2 | 2 |
|
| 3 | +import json |
3 | 4 | import journals |
| 5 | +from yaspin import yaspin |
| 6 | +from tqdm import tqdm |
4 | 7 | from local_core import ( |
5 | 8 | CoreAPIWorksCache, |
6 | 9 | ) |
| 10 | +import gdrive |
| 11 | +import nearestpdf |
| 12 | +import titlematch |
| 13 | +import website |
# One-time startup: populate the in-memory indexes the matching loop
# below depends on. Both loads are slow, so give each a spinner for
# consistent progress feedback (previously only website.load had one).
with yaspin(text="Loading website..."):
    website.load()

with yaspin(text="Loading nearest-PDF index..."):
    nearestpdf.load()
7 | 18 |
|
# Flatten the per-journal ISSN lists into one tracking list.
TRACKING_ISSNS = []
for issn_group in journals.issns.values():
    TRACKING_ISSNS.extend(issn_group)  # 268 entries at last count
# Caveat: this set is noisy -- other languages, review articles, etc.;
# an effective filter is still an open question.
10 | 21 |
|
11 | | -import website |
12 | | -website.load() |
13 | | - |
14 | | -DOIS = [( |
15 | | - c.get('source_url', ''), |
16 | | - c.get('external_url', ''), |
17 | | - c.get('doi',''), |
18 | | - c.get('alternate_doi', ''), |
19 | | -) for c in website.content] |
20 | | -DOIS = [ |
21 | | - doi.split('doi.org/')[1] |
22 | | - for doilist in DOIS |
23 | | - for doi in doilist |
24 | | - if 'doi.org/' in doi |
25 | | -] |
26 | | -# 95 of the above have fullText |
27 | | - |
# "Buddhist" articles that aren't for download
MDPI_PROVIDER_ID = 22080 # 351  -- presumably MDPI's CORE data-provider id; verify
# Exclusion terms for filtering results; the trailing numbers look like
# observed hit counts from manual queries -- TODO confirm their meaning.
ANTI_TRACKING_KEYWORDS = [
    "1556-5068", # SSRN ISSN. 2 # registered as query 3
    "NFTs", # 166
    "blockchain", # 869 # registered as 4
    "documentType:review", # 1040
]
36 | 30 |
|
# Local cache of CORE API "works" records, looked up by DOI.
core = CoreAPIWorksCache('/home/khbh/Desktop/core_api.db')

# Accumulators for match-quality statistics; currently only filled/inspected
# interactively via the breakpoint below.
self_similarities = []
self_plus_similarities = []
differences = []
title_similarities = []

# tqdm gives progress over the (long) website content list; it was imported
# above but previously unused.
for website_item in tqdm(website.content):
    # Only items whose primary format is a PDF can be matched against the
    # Drive PDF index.  NOTE(review): `formats` is attribute access while
    # every other field uses .get() -- confirm website.content items
    # support both access styles.
    if website_item.formats[0] != 'pdf':
        continue
    if not website_item.get("drive_links"):
        continue
    drive_id = gdrive.link_to_id(website_item['drive_links'][0])
    if drive_id not in nearestpdf.gid_to_idx:
        continue
    # Gather every DOI-bearing URL field and keep the part after 'doi.org/'.
    dois = [
        doi.split('doi.org/')[1]
        for doi in [
            website_item.get('source_url', ''),
            website_item.get('external_url', ''),
            website_item.get('doi', ''),
            website_item.get('alternate_doi', ''),
            website_item.get('alternative_doi', ''),
        ]
        if 'doi.org/' in doi
    ]
    # First DOI that has a cached CORE record wins.
    core_work = None
    for doi in dois:
        core_work = core.get_locally_from_doi(doi)
        if core_work:
            break
    if not core_work:
        continue
    if not core_work['full_text']:
        continue
    text_plus = f"{core_work['full_text']} {core_work['title']} {core_work['abstract'] or ''}"
    # Authors are stored as a JSON list of {'name': ...} dicts.  Fall back
    # to an empty *list* (the original fell back to '', handing a str where
    # a list is expected), and catch only the failures this parse can
    # actually raise instead of a bare `except`.
    authors = []
    try:
        authors = [auth['name'] for auth in json.loads(core_work['authors'])]
    except (TypeError, KeyError, json.JSONDecodeError):
        pass
    matches = nearestpdf.find_matching_files(
        core_work['title'],
        authors,
        text_plus,
    )
    # The Drive file the website claims this item corresponds to; kept in
    # scope for manual comparison inside the debugger below.
    drive_file = nearestpdf.google_files[nearestpdf.gid_to_idx[drive_id]]
    if len(matches) == 1 and matches[0][0]['id'] == drive_id:
        print("Got it with confidence", matches[0][1])
    else:
        # Deliberate interactive breakpoint for inspecting mismatches.
        # TODO: remove before running this script unattended.
        import ipdb; ipdb.set_trace()
37 | 84 |
|
38 | 85 |
|
39 | 86 |
|
0 commit comments