mirror of
https://github.com/firecrawl/firecrawl.git
synced 2026-06-16 11:01:38 +03:00
240 lines
6.9 KiB
TypeScript
240 lines
6.9 KiB
TypeScript
import { axiosTimeout } from "../../lib/timeout";
|
|
import { parseStringPromise } from "xml2js";
|
|
import { WebCrawler } from "./crawler";
|
|
import { scrapeURL } from "../scrapeURL";
|
|
import { scrapeOptions } from "../../controllers/v1/types";
|
|
import type { Logger } from "winston";
|
|
/**
 * True when the FIRE_ENGINE_BETA_URL environment variable is set to a
 * non-empty string.
 */
const useFireEngine = Boolean(process.env.FIRE_ENGINE_BETA_URL);
|
|
/**
 * Downloads the raw body of a sitemap through `scrapeURL` using the "fetch"
 * engine.
 *
 * Both modes currently issue the same request; the TLS-client fallback that
 * used to follow a failed fetch on the fire-engine path is disabled.
 *
 * @param sitemapUrl - URL of the sitemap document to download.
 * @param mode - Requested mode; only affects the metadata attached to the
 *   failure log.
 * @param logger - Logger that receives failure reports.
 * @returns The response body on a 2xx response, or `null` when the request
 *   failed, returned a non-2xx status, or threw (all of which are logged).
 */
async function fetchSitemapContent(
  sitemapUrl: string,
  mode: "axios" | "fire-engine",
  logger: Logger,
): Promise<string | null> {
  const logMeta = { method: "getLinksFromSitemap", mode, sitemapUrl };

  try {
    const fetchResponse = await scrapeURL(
      "sitemap",
      sitemapUrl,
      scrapeOptions.parse({ formats: ["rawHtml"] }),
      { forceEngine: "fetch" },
    );

    if (
      fetchResponse.success &&
      fetchResponse.document.metadata.statusCode >= 200 &&
      fetchResponse.document.metadata.statusCode < 300
    ) {
      return fetchResponse.document.rawHtml!;
    }

    // The failure detail is attached only on the fire-engine path, exactly as
    // before, so existing log consumers see the same metadata shape.
    if (mode === "fire-engine" && useFireEngine) {
      logger.error(`Request failed for ${sitemapUrl}, ran out of engines!`, {
        ...logMeta,
        error: fetchResponse.success
          ? fetchResponse.document
          : fetchResponse.error,
      });
    } else {
      logger.error(
        `Request failed for ${sitemapUrl}, ran out of engines!`,
        logMeta,
      );
    }
    return null;
  } catch (error) {
    logger.error(`Request failed for ${sitemapUrl}`, { ...logMeta, error });
    return null;
  }
}

/**
 * Fetches a sitemap, follows any nested sitemaps it references, and hands the
 * page URLs it finds to `urlsHandler`.
 *
 * - If the document root contains `<sitemap>` children, each `<loc>` is
 *   processed recursively and the results are summed.
 * - Otherwise, if it contains `<url>` children, every `<loc>` ending in
 *   ".xml" is treated as a nested sitemap and processed recursively; the
 *   remaining locations that `WebCrawler.prototype.isFile` does not flag are
 *   passed to `urlsHandler` in a single call.
 *
 * NOTE(review): recursion has no cycle detection, so sitemaps that reference
 * each other would recurse without bound.
 *
 * @param sitemapUrl - URL of the sitemap to process.
 * @param urlsHandler - Receives the page URLs found in this sitemap; its
 *   result is awaited.
 * @param mode - "axios" (default) or "fire-engine".
 * @param logger - Logger used for failure reports.
 * @returns The number of page URLs passed to `urlsHandler` across this
 *   sitemap and all nested ones; 0 when fetching or processing fails.
 */
export async function getLinksFromSitemap(
  {
    sitemapUrl,
    urlsHandler,
    mode = "axios",
  }: {
    sitemapUrl: string;
    urlsHandler(urls: string[]): unknown;
    mode?: "axios" | "fire-engine";
  },
  logger: Logger,
): Promise<number> {
  try {
    const content = await fetchSitemapContent(sitemapUrl, mode, logger);
    if (content === null) {
      return 0;
    }

    const parsed = await parseStringPromise(content);
    // Guard against a null parse result (presumably what an empty document
    // yields) instead of relying on a thrown TypeError.
    const root = parsed && (parsed.urlset || parsed.sitemapindex);
    let count = 0;

    if (root && root.sitemap) {
      // Sitemap index: every <sitemap><loc> points at another sitemap.
      const childSitemaps: string[] = [];
      for (const sitemap of root.sitemap) {
        if (sitemap.loc && sitemap.loc.length > 0) {
          childSitemaps.push(sitemap.loc[0].trim());
        }
      }

      const results = await Promise.all(
        childSitemaps.map((childUrl) =>
          getLinksFromSitemap({ sitemapUrl: childUrl, urlsHandler, mode }, logger),
        ),
      );
      // The initial value keeps an index with no usable <loc> entries from
      // throwing "Reduce of empty array with no initial value".
      count = results.reduce((sum, n) => sum + n, 0);
    } else if (root && root.url) {
      // Split the entries in one pass: ".xml" locations are nested sitemaps,
      // everything else that is not a file is a page URL.
      const nestedSitemaps: string[] = [];
      const pageUrls: string[] = [];
      for (const url of root.url) {
        if (!url.loc || url.loc.length === 0) {
          continue;
        }
        const loc: string = url.loc[0].trim();
        if (loc.toLowerCase().endsWith(".xml")) {
          nestedSitemaps.push(loc);
        } else if (!WebCrawler.prototype.isFile(loc)) {
          pageUrls.push(loc);
        }
      }

      if (nestedSitemaps.length > 0) {
        const nestedCounts = await Promise.all(
          nestedSitemaps.map((childUrl) =>
            getLinksFromSitemap(
              { sitemapUrl: childUrl, urlsHandler, mode },
              logger,
            ),
          ),
        );
        count += nestedCounts.reduce((sum, n) => sum + n, 0);
      }

      count += pageUrls.length;

      // The handler may be synchronous or asynchronous; awaiting covers both.
      await urlsHandler(pageUrls);
    }

    return count;
  } catch (error) {
    logger.debug(`Error processing sitemapUrl: ${sitemapUrl}`, {
      method: "getLinksFromSitemap",
      mode,
      sitemapUrl,
      error,
    });
  }

  return 0;
}
|
|
|
|
/**
 * Fetches `<url>/sitemap.xml` and returns its `<url>` entries.
 *
 * @param url - Site base URL, or a URL that already ends in "/sitemap.xml".
 *   Trailing slashes are ignored.
 * @param timeout - Request timeout passed to the scrape options; falls back
 *   to `axiosTimeout` when omitted or 0.
 * @returns The parsed entries on a 2xx response (possibly empty), `null` when
 *   the request was unsuccessful or returned a non-2xx status, and `[]` when
 *   fetching or parsing threw.
 */
export const fetchSitemapData = async (
  url: string,
  timeout?: number,
): Promise<SitemapEntry[] | null> => {
  // Strip trailing slashes so "https://host/" does not turn into
  // "https://host//sitemap.xml".
  const baseUrl = url.replace(/\/+$/, "");
  const sitemapUrl = baseUrl.endsWith("/sitemap.xml")
    ? baseUrl
    : `${baseUrl}/sitemap.xml`;

  try {
    const fetchResponse = await scrapeURL(
      "sitemap",
      sitemapUrl,
      scrapeOptions.parse({
        formats: ["rawHtml"],
        timeout: timeout || axiosTimeout,
      }),
      { forceEngine: "fetch" },
    );

    if (
      fetchResponse.success &&
      fetchResponse.document.metadata.statusCode >= 200 &&
      fetchResponse.document.metadata.statusCode < 300
    ) {
      const xml = fetchResponse.document.rawHtml!;
      const parsedXml = await parseStringPromise(xml);

      const sitemapData: SitemapEntry[] = [];
      if (parsedXml && parsedXml.urlset && parsedXml.urlset.url) {
        for (const urlElement of parsedXml.urlset.url) {
          // Skip entries without a usable <loc> rather than letting one
          // malformed element throw and discard the whole sitemap.
          const rawLoc = urlElement.loc && urlElement.loc[0];
          if (typeof rawLoc !== "string" || rawLoc.trim() === "") {
            continue;
          }

          // Trimmed for consistency with getLinksFromSitemap.
          const sitemapEntry: SitemapEntry = { loc: rawLoc.trim() };
          if (urlElement.lastmod) {
            sitemapEntry.lastmod = urlElement.lastmod[0];
          }
          if (urlElement.changefreq) {
            sitemapEntry.changefreq = urlElement.changefreq[0];
          }
          if (urlElement.priority) {
            sitemapEntry.priority = Number(urlElement.priority[0]);
          }
          sitemapData.push(sitemapEntry);
        }
      }

      return sitemapData;
    }
    return null;
  } catch (error) {
    // Deliberately best-effort: a failed fetch or parse is reported to the
    // caller as an empty list. No logger is available in this scope.
  }
  return [];
};
|
|
|
|
/**
 * One `<url>` element of a sitemap, as produced by `fetchSitemapData`.
 */
export interface SitemapEntry {
  /** First `<loc>` value of the element. */
  loc: string;

  /** First `<lastmod>` value, when the element has one. */
  lastmod?: string;

  /** First `<changefreq>` value, when the element has one. */
  changefreq?: string;

  /** First `<priority>` value converted with `Number`, when the element has one. */
  priority?: number;
}
|