Skip to content

Commit 151ae04

Browse files
committed
Add python script for refreshing the Drive Catalog
1 parent d7f6214 commit 151ae04

5 files changed

Lines changed: 201 additions & 10 deletions

File tree

‎.github/workflows/archive.yml‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
runs-on: ubuntu-latest
1212
steps:
1313
- name: Checkout the Code
14-
uses: actions/checkout@v3
14+
uses: actions/checkout@v4
1515
with:
1616
ref: main
1717
- name: Install Dependencies

‎.github/workflows/catalogue.yml‎

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
name: GDrive Cataloguer
2+
on:
3+
workflow_dispatch:
4+
schedule:
5+
- cron: "50 0 24 * *"
6+
jobs:
7+
gdrive:
8+
runs-on: ubuntu-latest
9+
steps:
10+
- name: Checkout the Code
11+
uses: actions/checkout@v4
12+
- name: Install Dependencies
13+
run: |
14+
pip install titlecase pyyaml pypdf tqdm yaspin bs4 google google-api-python-client google_auth_oauthlib joblib youtube-transcript-api
15+
printf '${{ secrets.GTOKEN }}' > ~/gtoken.json
16+
printf '${{ secrets.LIBRARY_UTILS_CLIENT_SECRET }}' > ~/library-utils-client-secret.json
17+
printf "${{ secrets.ARCHIVE_ORG_AUTH }}" > ~/archive.org.auth
18+
- name: Run Cataloguer
19+
run: python scripts/refresh_catalog_doc.py

‎_layouts/default.html‎

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,9 @@
8484
</head>
8585
<body>
8686
<a class="skip-to-main" href="#main">Skip to content</a>
87+
<noscript>
88+
<img src="https://{{ site.goatcounter }}.goatcounter.com/count?p=/{{ page.url }}&t={{ page.title | url_encode }}">
89+
</noscript>
8790

8891
<header class="site-header{% if page.banner_info %} banner-img{% if page.next_courses %} fullsize{% elsif site.header_pages contains page.path %} smallsize{% endif %}{% if page.image_center_y %}" style="background-position-y: {{ page.image_center_y }};{% if page.image_center_x %} background-position-x: {{ page.image_center_x }};{% endif %}{% endif %}{% endif %}" role="banner">
8992
<div class="wrapper"{% if page.banner_info %} style="backdrop-filter: blur(0.3px);"{% endif %}>
@@ -145,9 +148,6 @@ <h2 class="footer-heading">{{ site.title | escape }}</h2>
145148
data-goatcounter-settings='{"allow_frame": true, "title": "{{ page.title | replace: '"', '\\"' }}"}'
146149
async src="//gc.zgo.at/count.js">
147150
</script>
148-
<noscript>
149-
<img src="https://{{ site.goatcounter }}.goatcounter.com/count?p=/{{ page.url }}&t={{ page.title | url_encode }}">
150-
</noscript>
151151
<script async src="/assets/js/goatclicker.js">
152152
</script>
153153
{%- endif -%}

‎scripts/gdrive.py‎

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import os.path
44
from pathlib import Path
55
import requests
6-
import struct
6+
import socket
77
from math import floor
88
from io import BytesIO, BufferedIOBase
99
from strutils import (
@@ -21,7 +21,6 @@
2121
import pdfutils
2222
import json
2323
import re
24-
import shutil
2524
from functools import cache
2625
from archivedotorg import archive_urls
2726
try:
@@ -137,12 +136,17 @@ def google_credentials():
137136

138137
@cache
def session():
    """Return the 'drive' v3 service object, built once and memoised by @cache.

    Side effect: on the first call this sets the process-wide default socket
    timeout to 300 seconds.
    """
    # some of our uploads take a while...
    socket.setdefaulttimeout(300)
    drive_service = build('drive', 'v3', credentials=google_credentials())
    return drive_service
141141

142142
@cache
def youtube():
    """Return the 'youtube' v3 service object, built once and memoised by @cache."""
    youtube_service = build('youtube', 'v3', credentials=google_credentials())
    return youtube_service
145145

146+
@cache
def docs():
    """Return the ``documents()`` resource of the 'docs' v1 service, memoised by @cache."""
    docs_service = build('docs', 'v1', credentials=google_credentials())
    return docs_service.documents()
149+
146150
def get_ytvideo_snippets(ytids):
147151
snippets = []
148152
if len(ytids) > 50:
@@ -204,7 +208,7 @@ def string_to_media(s, mimeType):
204208
resumable=True,
205209
)
206210

207-
def create_doc(filename=None, html=None, rtf=None, folder_id=None, creator=None, custom_properties: dict[str, str] = None):
211+
def create_doc(filename=None, html=None, rtf=None, folder_id=None, creator=None, custom_properties: dict[str, str] = None, replace_doc=False):
208212
if bool(html) == bool(rtf):
209213
raise ValueError("Please specify either rtf OR html.")
210214
drive_service = session()
@@ -224,7 +228,7 @@ def create_doc(filename=None, html=None, rtf=None, folder_id=None, creator=None,
224228
media = string_to_media(html, 'text/html')
225229
if rtf:
226230
media = string_to_media(rtf, 'application/rtf')
227-
return _perform_upload(metadata, media, verbose=False)
231+
return _perform_upload(metadata, media, verbose=False, update_file=replace_doc)
228232

229233
def get_file_contents(fileid, verbose=True):
230234
"""Downloads and returns the contents of fileid in a BytesIO buffer"""
@@ -266,11 +270,15 @@ def upload_to_google_drive(file_path, creator=None, filename=None, folder_id=Non
266270
media = MediaFileUpload(file_path, resumable=True)
267271
return _perform_upload(file_metadata, media, verbose=verbose)
268272

269-
def _perform_upload(file_metadata, media, verbose=True):
273+
def _perform_upload(file_metadata, media, verbose=True, update_file=False):
270274
drive_service = session()
271275
try:
272276
# Upload the file
273-
request = drive_service.files().create(body=file_metadata, media_body=media)
277+
request = None
278+
if update_file:
279+
request = drive_service.files().update(fileId=update_file, body=file_metadata, media_body=media)
280+
else:
281+
request = drive_service.files().create(body=file_metadata, media_body=media)
274282
response = None
275283
while response is None:
276284
status, response = request.next_chunk()

‎scripts/refresh_catalog_doc.py‎

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
#!/bin/python
2+
3+
import gdrive
4+
import datetime
5+
from collections import defaultdict
6+
from functools import cache
7+
import hashlib
8+
9+
# Drive ID of the folder the catalogue walk starts from.
ROOT_FOLDER = "1RJi6bEXa25zizGdsm5evCycYuY6a2D8r"

# Comma-separated metadata fields requested for every listed file.
FIELDS = ",".join((
    "id",
    "name",
    "mimeType",
    "size",
    "shortcutDetails",
    "createdTime",
    "webViewLink",
))

# MD5 hex digests that a shortcut target's owner e-mail address is compared
# against; targets whose owner hashes to one of these are skipped.
MY_EMAILS = {
    "aee5188bd988b0ab263a6b3003831c6e",
    "e55371a7e1b97300ea623338dbcc0694",
    "3945098d73ac3a594febd2c87d357971",
    "3b654b6ccfb53f233fbd798415b62624",
    "b9083baac482b28ac374ebe1856bfefc",
    "7f519cc091d7690b440aa4db74141a94",
    "d97d9501979b0a1442b0482418509a84",
}

# Opening-tag text (without the angle brackets) for a padded table cell.
TD = 'td style="padding:5pt;"'
22+
23+
def md5(text):
    """Return the hexadecimal MD5 digest of the string ``text``.

    The text is encoded with ``str.encode()``'s default codec before hashing.
    """
    hasher = hashlib.md5()
    hasher.update(text.encode())
    return hasher.hexdigest()
25+
26+
def human_readable_size(bytes_size):
27+
units = ["B", "KB", "MB", "GB", "TB", "PB"]
28+
if bytes_size == 0:
29+
return "0 B"
30+
index = 0
31+
size = float(bytes_size)
32+
while size >= 922 and index < len(units) - 1:
33+
size /= 1024
34+
index += 1
35+
return f"{size:.2f} {units[index]}"
36+
37+
def headerize(text, depth):
    """Return the HTML used as the heading of a folder nested ``depth`` levels deep.

    Args:
        text: HTML to place inside the heading (inserted as-is, not escaped).
        depth: Nesting level of the folder.

    Returns:
        * ``''`` when ``depth`` is below 1 (no heading is emitted),
        * ``<h1>``..``<h4>`` for depths 1-4,
        * ``<h5>``/``<h6>`` with an explicit 11pt font size for depths 5-6,
        * an indented ``<p>`` prefixed with ``+`` for anything deeper, since
          HTML has no heading level beyond ``<h6>``.
    """
    if depth < 1:
        return ''
    if depth <= 4:
        return f"<h{depth}>{text}</h{depth}>"
    if depth <= 6:
        # Inline CSS declarations are "property: value"; with "=" the
        # declaration is invalid and the font size would be ignored.
        return f'<h{depth} style="font-size:11pt;">{text}</h{depth}>'
    space = "&nbsp;&nbsp;" * (depth - 1)
    return f"<p>{space}+ {text}</p>"
46+
47+
# IDs of every folder a DriveFolder has been constructed for so far.
# DriveFolder.__init__ refuses to load the same folder ID twice.
seen_folders = set()


class DriveFolder:
    """A Drive folder loaded eagerly together with all of its descendants.

    Constructing an instance lists the folder's children through ``gdrive``,
    resolves shortcuts, and recursively constructs a ``DriveFolder`` for each
    subfolder.

    Attributes:
        name: Display name of the folder.
        id: Drive ID of the folder (used to build its link in list_files()).
        createdTime: Creation timestamp string; used to order sibling folders.
        depth: Nesting level, 0 for the root.
        files: Metadata dicts of the non-folder children, sorted by
            ``createdTime``; each has an integer ``size``.
        subfolders: Child ``DriveFolder`` objects, sorted by ``createdTime``.
    """

    def __init__(self, name: str, folderid: str, createdTime: str, depth: int) -> None:
        """Load the folder ``folderid`` and, recursively, everything below it.

        Raises:
            ValueError: if a folder with this ID has already been loaded.
        """
        if folderid in seen_folders:
            raise ValueError(f"Folder already seen: {folderid}")
        seen_folders.add(folderid)
        print(f"Loading folder \"{name}\"...")
        self.name = name
        # Keep the folder's Drive ID, not the ``id`` builtin: list_files()
        # appends this to the folder link prefix.
        self.id = folderid
        self.createdTime = createdTime
        self.depth = depth
        self.files = []
        subfolders = []
        shortcuts = []
        query = f"trashed=false AND '{folderid}' in parents"
        for child in gdrive.all_files_matching(query, FIELDS):
            if child['mimeType'] == 'application/vnd.google-apps.folder':
                subfolders.append(child)
                continue
            if child['mimeType'] == 'application/vnd.google-apps.shortcut':
                # Shortcuts are resolved in one batch below.
                shortcuts.append(child)
                continue
            # Entries without a 'size' key are counted as 0 bytes.
            child['size'] = int(child.get('size', 0))
            self.files.append(child)
        if shortcuts:
            print(f" Resolving {len(shortcuts)} shortcut(s)...")
            for child in gdrive.batch_get_files_by_id(
                [c['shortcutDetails']['targetId'] for c in shortcuts],
                FIELDS+',owners'
            ):
                shortcut = [s for s in shortcuts if s['shortcutDetails']['targetId'] == child['id']][0]
                owner = child['owners'][0]
                if md5(owner['emailAddress']) in MY_EMAILS:
                    print(f" Skipping {shortcut['name']}->{child['name']} because it's owned by me")
                    continue
                # Present the target under the shortcut's name and creation
                # time, keeping the target's own values alongside.
                child['originalName'] = child['name']
                child['originalCreatedTime'] = child['createdTime']
                child['name'] = shortcut['name']
                child['createdTime'] = shortcut['createdTime']
                if child['mimeType'] == 'application/vnd.google-apps.folder':
                    subfolders.append(child)
                else:
                    child['size'] = int(child.get('size', 0))
                    self.files.append(child)
        print(f" Got {len(self.files)} files and {len(subfolders)} subfolders")
        self.subfolders = []
        for child in subfolders:
            self.subfolders.append(DriveFolder(
                child['name'],
                child['id'],
                child['createdTime'],
                self.depth + 1,
            ))
        self.files = sorted(self.files, key=lambda f: f['createdTime'])
        self.subfolders = sorted(self.subfolders, key=lambda f: f.createdTime)

    @cache
    def total_size(self):
        """Return the summed ``size`` of all files in this folder and below."""
        return sum(f['size'] for f in self.files) + sum(f.total_size() for f in self.subfolders)

    @cache
    def total_count(self):
        """Return the number of files in this folder and below."""
        return sum(f.total_count() for f in self.subfolders) + len(self.files)

    def file_count_by_mimetype(self):
        """Return ``{mimeType: {'size': ..., 'count': ...}}`` for this subtree.

        Keys appear in a deterministic order: this folder's files first (in
        ``self.files`` order), then any further types contributed by the
        subfolders in ``self.subfolders`` order.
        """
        ret = defaultdict(lambda: {'size': 0, 'count': 0})
        # Single pass over the files; iterating a set of MIME strings instead
        # would make the key order vary between runs.
        for f in self.files:
            entry = ret[f['mimeType']]
            entry['count'] += 1
            entry['size'] += f['size']
        for child in self.subfolders:
            for t, sub in child.file_count_by_mimetype().items():
                ret[t]['count'] += sub['count']
                ret[t]['size'] += sub['size']
        return ret

    def list_files(self):
        """Return the HTML listing of this folder: heading, files, then subfolders."""
        space = '&nbsp;&nbsp;'*self.depth
        ret = [headerize(
            f'<a href="{gdrive.FOLDER_LINK_PREFIX}{self.id}">{self.name}</a> <span style="color:#666666;">({human_readable_size(self.total_size())})</span>',
            self.depth,
        )]
        for child in self.files:
            ret.append(f"""<p>{space}- <a href="{child['webViewLink']}">{child['name']}</a></p>""")
        for child in self.subfolders:
            ret.append(child.list_files())
        return '\n'.join(ret)
135+
136+
if __name__ == "__main__":
    # Walk the whole library. The root's createdTime is a fixed placeholder:
    # createdTime is only read when a parent sorts its subfolders, and the
    # root is nobody's subfolder here.
    root = DriveFolder("A Curated Buddhist G-Library", ROOT_FOLDER, "2019-01-01T00:00:00Z", 0)
    total_size = human_readable_size(root.total_size())
    total_count = root.total_count()
    print("\n==================\nFinished fetching data!\n==================\n")

    # Pre-render the dynamic fragments so the template below stays readable.
    generated_on = datetime.datetime.now(datetime.timezone.utc).strftime("%a, %d %b %Y %H:%M:%S GMT")
    mime_rows = "".join(
        f"<tr><{TD}>{t}</td><{TD}>{c['count']}</td><{TD}>{human_readable_size(c['size'])}</td></tr>"
        for t, c in root.file_count_by_mimetype().items()
    )
    file_listing = root.list_files()

    # A <meta> element carrying only "content" is invalid and declares
    # nothing; http-equiv="content-type" makes the charset declaration real.
    html = f"""<html>
<head><meta http-equiv="content-type" content="text/html; charset=UTF-8"></head>
<body class="doc-content">
<p class="title" style="font-size:26pt;padding-bottom:3pt;line-height:1.15;page-break-after:avoid;font-family:&quot;Arial&quot;;orphans:2;widows:2;text-align:left;"><span style="font-weight:400;text-decoration:none;vertical-align:baseline;font-size:26pt;font-family:&quot;Arial&quot;;font-style:normal">Buddhist G-Library Catalog</span></p>
<p>An automatically generated list of all the files in the Library.</p>
<p>Generated on {generated_on}</p><p></p>
<p>In total, the library is {total_size} large and contains {total_count} files. They break down by MIME type as follows:</p>
<table><tr style="text-decoration:underline;"><{TD}>MIME Type</td><{TD}>Count</td><{TD}>Size</td></tr>
{mime_rows}
<tr style="font-weight:700;"><{TD}>Total</td><{TD}>{total_count}</td><{TD}>{total_size}</td></tr>
</table><p></p><h1>Files</h1><p></p>{file_listing}
</body>
</html>
"""

    print("Replacing public doc with new version...")
    # replace_doc carries the ID of the existing document to overwrite.
    # NOTE(review): the return value is presumably the document's file ID,
    # since it is interpolated into the URL below - confirm in gdrive.create_doc.
    docid = gdrive.create_doc(
        html=html,
        creator="CatalogBuilder",
        replace_doc="1rGLm9Xh5de0e3hsMY2yyt97MWBuZJ1V1_q0jhGe7vpw",
    )
    print(f"Done! See https://docs.google.com/document/d/{docid}/edit")

0 commit comments

Comments
 (0)