import type { Meta } from "..";
import type { Document } from "../../../controllers/v1/types";

/**
 * Matches a Markdown image whose destination is an inline base64 image data URI,
 * e.g. `![alt](data:image/png;base64,iVBORw0...)`.
 *
 * Capture group 1 is the `![alt]` part so the replacement can keep it.
 *
 * The media-type/parameter section and the payload are both restricted to
 * characters other than `)` and line breaks. An unbounded `.*?` there lets a
 * non-base64 `data:image/...` destination (such as a URL-encoded SVG) run past
 * its own closing parenthesis up to a later `;base64,` on the same line, which
 * deletes every character in between.
 */
const base64ImagePattern = /(!\[.*?\])\(data:image\/[^)\r\n]*?;base64,[^)\r\n]*\)/g;

/**
 * Transformer that empties the destination of Markdown images embedded as
 * base64 data URIs, turning `![alt](data:image/...;base64,...)` into `![alt]()`.
 *
 * Does nothing unless `meta.options.removeBase64Images` is truthy and
 * `document.markdown` is a string. All other document fields are left untouched.
 *
 * @param meta - Scrape context; only `meta.options.removeBase64Images` is read.
 * @param document - Document whose `markdown` field is rewritten in place.
 * @returns The same `document` instance that was passed in.
 */
export function removeBase64Images(meta: Meta, document: Document): Document {
  if (meta.options.removeBase64Images && typeof document.markdown === "string") {
    // String.prototype.replace with a global regex starts from index 0 on every
    // call, so sharing the module-level pattern between calls is safe.
    document.markdown = document.markdown.replace(base64ImagePattern, "$1()");
  }
  return document;
}