diff --git a/apps/api/src/scraper/scrapeURL/engines/docx/index.ts b/apps/api/src/scraper/scrapeURL/engines/docx/index.ts
index eb27b3bbc..737db8d9c 100644
--- a/apps/api/src/scraper/scrapeURL/engines/docx/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/docx/index.ts
@@ -18,3 +18,7 @@ export async function scrapeDOCX(meta: Meta): Promise {
     proxyUsed: "basic",
   };
 }
+
+export function docxMaxReasonableTime(meta: Meta): number {
+  return 15000;
+}
diff --git a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
index 1ab8baca9..99bd7c545 100644
--- a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
@@ -113,3 +113,7 @@ export async function scrapeURLWithFetch(
     proxyUsed: "basic",
   };
 }
+
+export function fetchMaxReasonableTime(meta: Meta): number {
+  return 15000;
+}
diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
index 71ec86c24..4c59cb3a4 100644
--- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
@@ -411,3 +411,18 @@ export async function scrapeURLWithFireEngineTLSClient(
     proxyUsed: response.usedMobileProxy ? "stealth" : "basic",
   };
 }
+
+export function fireEngineMaxReasonableTime(meta: Meta, engine: "chrome-cdp" | "playwright" | "tlsclient"): number {
+  if (engine === "tlsclient") {
+    return 15000;
+  } else if (engine === "playwright") {
+    return (meta.options.waitFor ?? 0) + 30000;
+  } else {
+    return (meta.options.waitFor ?? 0)
+      + (meta.options.actions?.reduce(
+        (a, x) => (x.type === "wait" ? (x.milliseconds ?? 2500) + a : 250 + a),
+        0
+      ) ?? 0)
+      + 30000;
+  }
+}
diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts
index 9e4d380a8..fdeaad355 100644
--- a/apps/api/src/scraper/scrapeURL/engines/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/index.ts
@@ -1,15 +1,16 @@
 import { ScrapeActionContent } from "../../../lib/entities";
 import { Meta } from "..";
-import { scrapeDOCX } from "./docx";
+import { docxMaxReasonableTime, scrapeDOCX } from "./docx";
 import {
+  fireEngineMaxReasonableTime,
   scrapeURLWithFireEngineChromeCDP,
   scrapeURLWithFireEnginePlaywright,
   scrapeURLWithFireEngineTLSClient,
 } from "./fire-engine";
-import { scrapePDF } from "./pdf";
-import { scrapeURLWithFetch } from "./fetch";
-import { scrapeURLWithPlaywright } from "./playwright";
-import { scrapeURLWithIndex } from "./index/index";
+import { pdfMaxReasonableTime, scrapePDF } from "./pdf";
+import { fetchMaxReasonableTime, scrapeURLWithFetch } from "./fetch";
+import { playwrightMaxReasonableTime, scrapeURLWithPlaywright } from "./playwright";
+import { indexMaxReasonableTime, scrapeURLWithIndex } from "./index/index";
 import { useIndex } from "../../../services";
 import { hasFormatOfType } from "../../../lib/format-utils";
 
@@ -145,6 +146,25 @@ const engineHandlers: {
   docx: scrapeDOCX,
 };
 
+const engineMRTs: {
+  [E in Engine]: (meta: Meta) => number;
+} = {
+  "index": indexMaxReasonableTime,
+  "index;documents": indexMaxReasonableTime,
+  "fire-engine;chrome-cdp": (meta) => fireEngineMaxReasonableTime(meta, "chrome-cdp"),
+  "fire-engine(retry);chrome-cdp": (meta) => fireEngineMaxReasonableTime(meta, "chrome-cdp"),
+  "fire-engine;chrome-cdp;stealth": (meta) => fireEngineMaxReasonableTime(meta, "chrome-cdp"),
+  "fire-engine(retry);chrome-cdp;stealth": (meta) => fireEngineMaxReasonableTime(meta, "chrome-cdp"),
+  "fire-engine;playwright": (meta) => fireEngineMaxReasonableTime(meta, "playwright"),
+  "fire-engine;playwright;stealth": (meta) => fireEngineMaxReasonableTime(meta, "playwright"),
+  "fire-engine;tlsclient": (meta) => fireEngineMaxReasonableTime(meta, "tlsclient"),
+  "fire-engine;tlsclient;stealth": (meta) => fireEngineMaxReasonableTime(meta, "tlsclient"),
+  playwright: playwrightMaxReasonableTime,
+  fetch: fetchMaxReasonableTime,
+  pdf: pdfMaxReasonableTime,
+  docx: docxMaxReasonableTime,
+};
+
 export const engineOptions: {
   [E in Engine]: {
     // A list of feature flags the engine supports.
@@ -525,3 +545,13 @@ export async function scrapeURLWithEngine(
 
   return await fn(_meta);
 }
+
+export function getEngineMaxReasonableTime(meta: Meta, engine: Engine): number {
+  const mrt = engineMRTs[engine];
+  // shan't happen - mogery
+  if (mrt === undefined) {
+    meta.logger.warn("No MRT for engine", { engine });
+    return 30000;
+  }
+  return mrt(meta);
+}
diff --git a/apps/api/src/scraper/scrapeURL/engines/index/index.ts b/apps/api/src/scraper/scrapeURL/engines/index/index.ts
index 769350b26..d02e21f31 100644
--- a/apps/api/src/scraper/scrapeURL/engines/index/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/index/index.ts
@@ -237,3 +237,7 @@ export async function scrapeURLWithIndex(meta: Meta): Promise {
   let result: EngineScrapeResultWithContext | null = null;
   while (remainingEngines.length > 0) {
-    // TODO: REPLACE WITH Engine.maxReasonableTime TODOv2
-    const waitUntilWaterfall = meta.options.timeout !== undefined
-      ? Math.round(meta.options.timeout / Math.min(remainingEngines.length, 2))
-      : (!meta.options.actions && !hasFormatOfType(meta.options.formats, "json"))
-        ? Math.round(120000 / Math.min(remainingEngines.length, 2))
-        : Math.round(300000 / Math.min(remainingEngines.length, 2));
-
     const { engine, unsupportedFeatures } = remainingEngines.shift()!;
+
+    const waitUntilWaterfall = getEngineMaxReasonableTime(meta, engine);
     if (!isFinite(waitUntilWaterfall) || isNaN(waitUntilWaterfall) || waitUntilWaterfall <= 0) {
       meta.logger.warn("Invalid waitUntilWaterfall value", {
@@ -404,11 +400,11 @@ async function scrapeURLLoop(meta: Meta): Promise {
     try {
       result = await Promise.race([
         ...enginePromises.map(x => x.promise),
-        new Promise((_, reject) => {
+        ...(remainingEngines.length > 0 ? [new Promise((_, reject) => {
           setTimeout(() => {
             reject(new WaterfallNextEngineSignal());
           }, waitUntilWaterfall);
-        }),
+        })] : []),
         new Promise((_, reject) => {
           setTimeout(() => {
             try {
@@ -449,8 +445,8 @@ async function scrapeURLLoop(meta: Meta): Promise {
           meta.logger.warn("LLM refusal encountered", { error: error.error });
           throw error.error;
         } else if (error.error instanceof FEPageLoadFailed) {
-          meta.logger.warn("FEPageLoadFailed encountered!!", { error: error.error });
-          // TODO: what to do about this? TODOv2
+          // This is the internal timeout bug on f-e and should be treated as an EngineError.
+          meta.logger.warn("FEPageLoadFailed encountered", { error: error.error });
         } else if (error.error instanceof AbortManagerThrownError) {
           if (error.error.tier === "engine") {
             meta.logger.warn("Engine " + error.engine + " timed out while scraping.", { error: error.error });