diff --git a/package-lock.json b/package-lock.json index 005c376..caf5217 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "google-indexing-script", - "version": "0.3.0", + "version": "0.4.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "google-indexing-script", - "version": "0.3.0", + "version": "0.4.0", "license": "MIT", "dependencies": { "commander": "^12.1.0", diff --git a/src/index.ts b/src/index.ts index e764d97..a51a43d 100644 --- a/src/index.ts +++ b/src/index.ts @@ -16,6 +16,39 @@ import { readFileSync, existsSync, mkdirSync, writeFileSync } from "fs"; import path from "path"; const CACHE_TIMEOUT = 1000 * 60 * 60 * 24 * 14; // 14 days +const SHORT_TIMEOUT = 1000 * 60 * 60; // 1 hour + +const indexableStatuses = [ + Status.SubmittedAndIndexed, + Status.CrawledCurrentlyNotIndexed, + Status.DiscoveredCurrentlyNotIndexed, + Status.Forbidden, + Status.Error, + Status.RateLimited, +]; + +const quickFixStatuses = [Status.NotFound, Status.PageWithRedirect]; + +const shouldRecheck = (status: Status, lastCheckedAt: string) => { + const timeSinceLastCheck = Date.now() - new Date(lastCheckedAt).getTime(); + + if (indexableStatuses.includes(status)) { + if (status === Status.SubmittedAndIndexed) { + return timeSinceLastCheck > CACHE_TIMEOUT; + } + // For other indexable statuses, check more frequently + return timeSinceLastCheck > SHORT_TIMEOUT; + } + + if (quickFixStatuses.includes(status)) { + // For statuses that might be quickly fixed, use a shorter timeout + return timeSinceLastCheck > SHORT_TIMEOUT; + } + + // For any other status, use the standard cache timeout + return timeSinceLastCheck > CACHE_TIMEOUT; +}; + export const QUOTA = { rpm: { retries: 3, @@ -108,22 +141,26 @@ export const index = async (input: string = process.argv[2], options: IndexOptio [Status.RateLimited]: [], [Status.Forbidden]: [], [Status.Error]: [], + [Status.NotFound]: [], }; const indexableStatuses = [ - Status.DiscoveredCurrentlyNotIndexed, + Status.SubmittedAndIndexed, Status.CrawledCurrentlyNotIndexed, + Status.DiscoveredCurrentlyNotIndexed, Status.URLIsUnknownToGoogle, Status.Forbidden, Status.Error, Status.RateLimited, + Status.NotFound, ]; - const shouldRecheck = (status: Status, lastCheckedAt: string) => { - const shouldIndexIt = indexableStatuses.includes(status); - const isOld = new Date(lastCheckedAt) < new Date(Date.now() - CACHE_TIMEOUT); - return shouldIndexIt && isOld; - }; + const urlsToProcess = pages.filter((url) => { + const result = statusPerUrl[url]; + return !result || shouldRecheck(result.status, result.lastCheckedAt); + }); + + console.log(`👉 Found ${urlsToProcess.length} URLs that need processing out of ${pages.length} total URLs`); await batch( async (url) => { @@ -136,7 +173,7 @@ export const index = async (input: string = process.argv[2], options: IndexOptio pagesPerStatus[result.status] = pagesPerStatus[result.status] ? [...pagesPerStatus[result.status], url] : [url]; }, - pages, + urlsToProcess, 50, (batchIndex, batchCount) => { console.log(`📦 Batch ${batchIndex + 1} of ${batchCount} complete`); diff --git a/src/shared/types.ts b/src/shared/types.ts index db66fb6..5572c2c 100644 --- a/src/shared/types.ts +++ b/src/shared/types.ts @@ -11,4 +11,5 @@ export enum Status { RateLimited = "RateLimited", Forbidden = "Forbidden", Error = "Error", + NotFound = "Not found (404)", }