Skip to content

Commit bc38b0e

Browse files
committed
Fix Search Console stale URL redirects
1 parent 0893c25 commit bc38b0e

6 files changed

Lines changed: 1409 additions & 50 deletions
Lines changed: 327 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,327 @@
1+
#!/usr/bin/env node
2+
"use strict";
3+
4+
const fs = require("fs");
5+
const path = require("path");
6+
const zlib = require("zlib");
7+
8+
const { shouldDropSitemapPath } = require("./postbuild-seo.js");
9+
10+
// Filesystem layout: the site root is the parent of the directory that
// contains this script, and the build output lives in its "build" folder.
const siteDir = path.resolve(__dirname, "..");
const buildDir = path.join(siteDir, "build");

// Build artifacts consulted by the audit.
const [redirectsPath, sitemapPath] = ["_redirects", "sitemap.xml"].map(
  (artifactName) => path.join(buildDir, artifactName),
);

// Origin that every audited URL is compared against (by host).
const siteOrigin = "https://api-docs.quran.foundation";
const { host: siteHost } = new URL(siteOrigin);
16+
17+
/**
 * Decode XML character data: the five predefined entities (quot, apos, lt,
 * gt, amp) plus decimal and hexadecimal numeric character references.
 *
 * Decoding happens in a single left-to-right pass, so text produced by one
 * replacement is never decoded a second time (an escaped ampersand followed
 * by "lt;" yields the literal four characters, not "<").
 *
 * @param {*} value - Raw XML text; any falsy value is treated as "".
 * @returns {string} The decoded text.
 */
function decodeXml(value) {
  const namedEntities = { quot: '"', apos: "'", lt: "<", gt: ">", amp: "&" };

  return String(value || "").replace(
    /&(quot|apos|lt|gt|amp|#[0-9]+|#x[0-9a-fA-F]+);/g,
    (reference, body) => {
      if (body[0] !== "#") {
        return namedEntities[body];
      }

      const codePoint =
        body[1] === "x"
          ? Number.parseInt(body.slice(2), 16)
          : Number.parseInt(body.slice(1), 10);

      // Leave out-of-range references untouched instead of letting
      // String.fromCodePoint throw a RangeError.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : reference;
    },
  );
}
25+
26+
/**
 * Extract one attribute value from the raw attribute text of an XML tag.
 *
 * The name must be a whole attribute name: it has to start the text or follow
 * whitespace, so asking for "Id" does not pick up the tail of "r:id". The
 * name is regex-escaped, and both double- and single-quoted values are
 * accepted. Matching stays case-insensitive, as before.
 *
 * @param {string} attrs - Text between the tag name and the closing ">".
 * @param {string} name - Attribute name, including any prefix (e.g. "r:id").
 * @returns {string|null} The decoded value, or null when the attribute is absent.
 */
function getAttr(attrs, name) {
  const escapedName = name.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const pattern = new RegExp(
    `(?:^|\\s)${escapedName}\\s*=\\s*(?:"([^"]*)"|'([^']*)')`,
    "i",
  );
  const match = attrs.match(pattern);

  // Exactly one of the two capture groups participates in a match.
  return match ? decodeXml(match[1] ?? match[2]) : null;
}
30+
31+
/**
 * Locate the ZIP end-of-central-directory record by scanning backwards from
 * the last offset at which a minimal (22-byte) record could begin.
 *
 * @param {Buffer} buffer - Full contents of the archive.
 * @returns {{entries: number, centralDirectoryOffset: number}} The 16-bit
 *   value read at +10 and the 32-bit value read at +16 of the record.
 * @throws {Error} When no record signature is found.
 */
function findEndOfCentralDirectory(buffer) {
  const EOCD_SIGNATURE = 0x06054b50;
  const EOCD_MIN_SIZE = 22;

  let position = buffer.length - EOCD_MIN_SIZE;
  while (position >= 0) {
    if (buffer.readUInt32LE(position) === EOCD_SIGNATURE) {
      const entries = buffer.readUInt16LE(position + 10);
      const centralDirectoryOffset = buffer.readUInt32LE(position + 16);
      return { entries, centralDirectoryOffset };
    }
    position -= 1;
  }

  throw new Error("Could not find XLSX central directory");
}
43+
44+
/**
 * Read every entry of a ZIP archive into memory as UTF-8 text.
 *
 * Walks the central directory, follows each record to its local header to
 * find where the payload starts, and handles stored (method 0) and raw
 * deflate (method 8) entries.
 *
 * @param {string} filePath - Path of the archive on disk.
 * @returns {Map<string, string>} Entry name -> entry contents decoded as UTF-8.
 * @throws {Error} On a bad central/local header signature or an unsupported
 *   compression method.
 */
function readZipEntries(filePath) {
  const CENTRAL_SIGNATURE = 0x02014b50;
  const LOCAL_SIGNATURE = 0x04034b50;

  const archive = fs.readFileSync(filePath);
  const { entries: entryCount, centralDirectoryOffset } =
    findEndOfCentralDirectory(archive);
  const contents = new Map();

  let cursor = centralDirectoryOffset;
  let remaining = entryCount;

  while (remaining > 0) {
    if (archive.readUInt32LE(cursor) !== CENTRAL_SIGNATURE) {
      throw new Error(`Invalid ZIP central directory in ${filePath}`);
    }

    // Fixed-position fields of the central directory record.
    const method = archive.readUInt16LE(cursor + 10);
    const compressedSize = archive.readUInt32LE(cursor + 20);
    const nameLength = archive.readUInt16LE(cursor + 28);
    const extraLength = archive.readUInt16LE(cursor + 30);
    const commentLength = archive.readUInt16LE(cursor + 32);
    const localOffset = archive.readUInt32LE(cursor + 42);
    const nameStart = cursor + 46;
    const name = archive.toString("utf8", nameStart, nameStart + nameLength);

    if (archive.readUInt32LE(localOffset) !== LOCAL_SIGNATURE) {
      throw new Error(`Invalid ZIP local header for ${name}`);
    }

    // The local header carries its own name/extra lengths; the payload
    // begins right after them.
    const payloadStart =
      localOffset +
      30 +
      archive.readUInt16LE(localOffset + 26) +
      archive.readUInt16LE(localOffset + 28);
    const payload = archive.subarray(payloadStart, payloadStart + compressedSize);

    let data;
    switch (method) {
      case 0:
        data = payload;
        break;
      case 8:
        data = zlib.inflateRawSync(payload);
        break;
      default:
        throw new Error(`Unsupported ZIP compression method ${method} for ${name}`);
    }

    contents.set(name, data.toString("utf8"));
    cursor = nameStart + nameLength + extraLength + commentLength;
    remaining -= 1;
  }

  return contents;
}
88+
89+
/**
 * Build the workbook's shared-string table.
 *
 * Each `<si>` item contributes one string: the text of all `<t>` elements
 * inside it, concatenated in document order and then XML-decoded.
 *
 * @param {Map<string, string>} entries - Archive entries keyed by name.
 * @returns {string[]} Shared strings in document order; empty when the
 *   archive has no "xl/sharedStrings.xml" entry.
 */
function readSharedStrings(entries) {
  const xml = entries.get("xl/sharedStrings.xml");
  if (!xml) {
    return [];
  }

  const strings = [];
  for (const [, itemXml] of xml.matchAll(/<si\b[^>]*>([\s\S]*?)<\/si>/g)) {
    let rawText = "";
    for (const [, run] of itemXml.matchAll(/<t\b[^>]*>([\s\S]*?)<\/t>/g)) {
      rawText += run;
    }
    strings.push(decodeXml(rawText));
  }

  return strings;
}
103+
104+
/**
 * Resolve a worksheet name to the archive entry that holds its XML.
 *
 * The sheet's relationship id is looked up in "xl/workbook.xml" and then
 * matched against "xl/_rels/workbook.xml.rels". A relative Target is resolved
 * against the "xl/" folder; a Target starting with "/" is taken as relative
 * to the package root, so "/xl/worksheets/sheet1.xml" maps to
 * "xl/worksheets/sheet1.xml" rather than "xl/xl/worksheets/sheet1.xml".
 *
 * @param {Map<string, string>} entries - Archive entries keyed by name.
 * @param {string} sheetName - Worksheet name as it appears in the workbook.
 * @returns {string} Entry name of the worksheet part.
 * @throws {Error} When workbook metadata, the sheet, its relationship, or the
 *   relationship's Target is missing.
 */
function getWorkbookSheetPath(entries, sheetName) {
  const workbook = entries.get("xl/workbook.xml");
  const rels = entries.get("xl/_rels/workbook.xml.rels");
  if (!workbook || !rels) {
    throw new Error("XLSX workbook metadata is missing");
  }

  const sheet = [...workbook.matchAll(/<sheet\b([^>]*)\/>/g)]
    .map(([, attrs]) => ({
      name: getAttr(attrs, "name"),
      relId: getAttr(attrs, "r:id"),
    }))
    .find((item) => item.name === sheetName);

  if (!sheet) {
    throw new Error(`XLSX sheet not found: ${sheetName}`);
  }

  const rel = [...rels.matchAll(/<Relationship\b([^>]*)\/>/g)]
    .map(([, attrs]) => ({
      id: getAttr(attrs, "Id"),
      target: getAttr(attrs, "Target"),
    }))
    .find((item) => item.id === sheet.relId);

  if (!rel) {
    throw new Error(`XLSX sheet relationship not found: ${sheet.relId}`);
  }

  if (!rel.target) {
    // Fail with a descriptive error instead of a TypeError on a null target.
    throw new Error(`XLSX sheet relationship has no target: ${sheet.relId}`);
  }

  // Archive entry names use "/" separators and never start with one.
  return rel.target.startsWith("/")
    ? path.posix.normalize(rel.target.replace(/^\/+/, ""))
    : path.posix.join("xl", rel.target);
}
135+
136+
/**
 * Convert the column letters of a cell reference into a zero-based index
 * ("A1" -> 0, "Z9" -> 25, "AA10" -> 26).
 *
 * @param {*} cellRef - Cell reference; a falsy value, or one without leading
 *   letters, is treated as column "A".
 * @returns {number} Zero-based column index.
 */
function columnIndex(cellRef) {
  const reference = String(cellRef || "A");
  const leadingLetters = /^[A-Z]+/i.exec(reference);
  const columnName = ((leadingLetters && leadingLetters[0]) || "A").toUpperCase();

  // Bijective base-26: "A" counts as 1, so shift to zero-based at the end.
  let ordinal = 0;
  for (const symbol of columnName) {
    ordinal = ordinal * 26 + symbol.charCodeAt(0) - 64;
  }

  return ordinal - 1;
}
143+
144+
/**
 * Read one worksheet into an array of rows, each an array of cell strings
 * indexed by zero-based column.
 *
 * Cell values are resolved by type: "s" looks the value up in the shared
 * string table, "inlineStr" concatenates the cell's `<t>` elements, and
 * anything else uses the raw `<v>` text. Missing values become "".
 *
 * Self-closing rows and cells (`<row .../>`, `<c .../>`) are matched as
 * elements in their own right, so they no longer swallow the element that
 * follows them. A cell without an "r" attribute is placed immediately after
 * the previous cell of its row instead of always landing in column 0.
 *
 * @param {Map<string, string>} entries - Archive entries keyed by name.
 * @param {string} sheetName - Worksheet name as it appears in the workbook.
 * @returns {string[][]} Rows in document order; columns with no cell are holes.
 * @throws {Error} When the worksheet entry is missing from the archive.
 */
function readWorksheetRows(entries, sheetName) {
  const sharedStrings = readSharedStrings(entries);
  const sheetPath = getWorkbookSheetPath(entries, sheetName);
  const xml = entries.get(sheetPath);
  if (!xml) {
    throw new Error(`XLSX worksheet payload missing: ${sheetPath}`);
  }

  // The lazy attribute group lets "/>" be tried before ">", so the
  // self-closing form ends the match at its own tag. matchAll works on a
  // copy of the regex, so sharing these objects is safe.
  const rowPattern = /<row\b([^>]*?)(?:\/>|>([\s\S]*?)<\/row>)/g;
  const cellPattern = /<c\b([^>]*?)(?:\/>|>([\s\S]*?)<\/c>)/g;

  return [...xml.matchAll(rowPattern)].map(([, , rowXml = ""]) => {
    const row = [];
    let nextIndex = 0;

    for (const [, attrs, cellXml = ""] of rowXml.matchAll(cellPattern)) {
      const cellRef = getAttr(attrs, "r");
      const index = cellRef == null ? nextIndex : columnIndex(cellRef);
      nextIndex = index + 1;

      const type = getAttr(attrs, "t");
      const value = cellXml.match(/<v>([\s\S]*?)<\/v>/)?.[1] || "";

      if (type === "s") {
        row[index] = sharedStrings[Number(value)] || "";
      } else if (type === "inlineStr") {
        row[index] = decodeXml(
          [...cellXml.matchAll(/<t\b[^>]*>([\s\S]*?)<\/t>/g)]
            .map(([, text]) => text)
            .join(""),
        );
      } else {
        row[index] = decodeXml(value);
      }
    }

    return row;
  });
}
176+
177+
/**
 * Load one export workbook and pull out its issue label and URL list.
 *
 * The "Table" sheet must have a header cell equal to "URL"; every non-empty
 * value in that column below the header is returned. The issue label comes
 * from the "Metadata" row whose first cell is "Issue", falling back to the
 * file's base name.
 *
 * @param {string} filePath - Path to the .xlsx export.
 * @returns {{issue: string, urls: string[]}} Issue label and URLs.
 * @throws {Error} When the "Table" sheet has no "URL" header.
 */
function readSearchConsoleExport(filePath) {
  const entries = readZipEntries(filePath);
  const tableRows = readWorksheetRows(entries, "Table");
  const metadataRows = readWorksheetRows(entries, "Metadata");

  const [header, ...dataRows] = tableRows;
  const urlIndex = header == null ? undefined : header.indexOf("URL");

  const issueRow = metadataRows.find((row) => row[0] === "Issue");
  const issue = (issueRow && issueRow[1]) || path.basename(filePath);

  if (urlIndex === undefined || urlIndex < 0) {
    throw new Error(`Could not find URL column in ${filePath}`);
  }

  const urls = [];
  for (const row of dataRows) {
    const cell = row[urlIndex];
    if (cell) {
      urls.push(cell);
    }
  }

  return { issue, urls };
}
195+
196+
/**
 * Parse the build's redirects file into a lookup keyed by source path.
 *
 * Blank lines and lines starting with "#" are ignored. Every other line is
 * split on whitespace into source, target and status; a later line with the
 * same source replaces the earlier one.
 *
 * @returns {Map<string, {target: string|undefined, status: string|undefined}>}
 *   Redirect rules by source; empty when the file does not exist.
 */
function readRedirectSources() {
  const redirects = new Map();
  if (!fs.existsSync(redirectsPath)) {
    return redirects;
  }

  const rules = fs
    .readFileSync(redirectsPath, "utf8")
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((rule) => rule !== "" && !rule.startsWith("#"));

  rules.forEach((rule) => {
    const fields = rule.split(/\s+/);
    redirects.set(fields[0], { target: fields[1], status: fields[2] });
  });

  return redirects;
}
214+
215+
/**
 * Collect the pathname of every `<loc>` URL in the build's sitemap.
 *
 * @returns {Set<string>} Distinct URL pathnames; empty when the sitemap file
 *   does not exist.
 */
function readSitemapPaths() {
  const pathnames = new Set();
  if (!fs.existsSync(sitemapPath)) {
    return pathnames;
  }

  const sitemapXml = fs.readFileSync(sitemapPath, "utf8");
  const locPattern = /<loc>([^<]+)<\/loc>/g;

  let found = locPattern.exec(sitemapXml);
  while (found !== null) {
    pathnames.add(new URL(found[1]).pathname);
    found = locPattern.exec(sitemapXml);
  }

  return pathnames;
}
227+
228+
/**
 * Check whether a URL pathname maps to a file in the build output.
 *
 * A pathname that does not end in "/" and has a file extension is looked up
 * as that file; every other pathname is looked up as a directory containing
 * "index.html".
 *
 * @param {string} pathname - URL pathname such as "/docs/page/".
 * @returns {boolean} True when the candidate file exists on disk.
 */
function buildPathExists(pathname) {
  const relativePath = pathname.replace(/^\//, "");
  const namesFile = !pathname.endsWith("/") && path.extname(pathname) !== "";
  const segments = namesFile ? [relativePath] : [relativePath, "index.html"];

  return fs.existsSync(path.join(buildDir, ...segments));
}
238+
239+
/**
 * Return the pathname of a URL after checking it belongs to this site.
 *
 * @param {string} rawUrl - Absolute URL taken from an export.
 * @returns {string} The URL's pathname.
 * @throws {Error} When the host differs from `siteHost` or the protocol is
 *   neither http nor https.
 * @throws {TypeError} When `rawUrl` cannot be parsed as a URL.
 */
function pathnameFromUrl(rawUrl) {
  const { host, protocol, pathname } = new URL(rawUrl);
  const isWebProtocol = protocol === "http:" || protocol === "https:";

  if (!(host === siteHost && isWebProtocol)) {
    throw new Error(`Unexpected Search Console origin: ${rawUrl}`);
  }

  return pathname;
}
247+
248+
/**
 * Assign an audit category to one reported pathname.
 *
 * Checks run in priority order: an existing redirect source, a "Not found
 * (404)" issue, an "/auth-*" final segment, a path dropped from the sitemap,
 * a sitemap member, and finally a file present in the build output.
 *
 * @param {string} pathname - URL pathname being audited.
 * @param {string} issue - Issue label of the export the URL came from.
 * @param {Map<string, *>} redirects - Redirect rules keyed by source path.
 * @param {Set<string>} sitemapPaths - Pathnames listed in the sitemap.
 * @returns {string} One of "redirected", "unresolved-404", "manual-review",
 *   "canonicalized-alternate", "valid-self-canonical", "valid-static-resource".
 */
function classify(pathname, issue, redirects, sitemapPaths) {
  let verdict;

  if (redirects.has(pathname)) {
    verdict = "redirected";
  } else if (issue === "Not found (404)") {
    // A 404 without a redirect is unresolved whether or not it is an auth path.
    verdict = "unresolved-404";
  } else if (/\/auth-[a-z0-9-]+\/?$/i.test(pathname)) {
    verdict = "manual-review";
  } else if (shouldDropSitemapPath(pathname)) {
    verdict = "canonicalized-alternate";
  } else if (sitemapPaths.has(pathname)) {
    verdict = "valid-self-canonical";
  } else if (buildPathExists(pathname)) {
    verdict = "valid-static-resource";
  } else {
    verdict = "manual-review";
  }

  return verdict;
}
275+
276+
/**
 * Command-line entry point: audit each export file named on the command line.
 *
 * For every file it classifies each URL, prints a JSON report (file name,
 * issue, total, per-category counts, and up to eight example pathnames that
 * are "unresolved-404" or "manual-review"), and sets the process exit code
 * to 1 when any URL across all files is "unresolved-404".
 *
 * @throws {Error} When no export file is given.
 */
function main() {
  const files = process.argv.slice(2);
  if (files.length === 0) {
    throw new Error("Usage: node scripts/audit-search-console-coverage.js <export.xlsx> [...]");
  }

  const redirects = readRedirectSources();
  const sitemapPaths = readSitemapPaths();
  const exampleWorthy = new Set(["unresolved-404", "manual-review"]);
  const maxExamples = 8;
  let unresolvedCount = 0;

  files.forEach((filePath) => {
    const { issue, urls } = readSearchConsoleExport(filePath);
    const counts = new Map();
    const examples = [];

    urls.forEach((rawUrl) => {
      const pathname = pathnameFromUrl(rawUrl);
      const classification = classify(pathname, issue, redirects, sitemapPaths);
      const seenSoFar = counts.get(classification) || 0;
      counts.set(classification, seenSoFar + 1);

      if (exampleWorthy.has(classification) && examples.length < maxExamples) {
        examples.push(pathname);
      }
    });

    unresolvedCount += counts.get("unresolved-404") || 0;

    const report = {
      file: path.basename(filePath),
      issue,
      total: urls.length,
      counts: Object.fromEntries([...counts.entries()].sort()),
      examples,
    };
    console.log(JSON.stringify(report, null, 2));
  });

  if (unresolvedCount > 0) {
    process.exitCode = 1;
  }
}
324+
325+
// Run the audit only when this file is the process entry point, not when it
// is loaded by another module.
const isDirectRun = module === require.main;
if (isDirectRun) {
  main();
}

0 commit comments

Comments
 (0)