mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2026-06-15 23:20:32 +03:00
fix: strip XML-illegal control characters from generated documents (#5760)
* fix: strip XML-illegal control characters from generated documents
OOXML documents (.docx/.xlsx/.pptx) embed their text directly into
internal XML parts (e.g. word/document.xml). XML 1.0 §2.2 forbids every
C0 control character except tab, line feed, and carriage return. When an
LLM emits LaTeX such as `\frac`, the `\f` decodes (via the tool-call JSON
escape) to a form feed (U+000C) which lands in the content unchanged. The
generated file is a valid ZIP but Office refuses to open it ("Word
experienced an error trying to open the file.").
Add a shared `stripInvalidXmlChars` helper to the create-files manager
that removes the illegal characters from a string or, recursively, from
every string in a nested array/object. Apply it to the user/LLM-supplied
content in the docx, pdf, xlsx, and pptx handlers before generation.
Plain-text output is left untouched since control characters are legal
there.
Resolves #5756
* unset binary format
* update test suite to catch drifts
* lint
---------
Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
This commit is contained in:
@@ -0,0 +1,86 @@
|
||||
/* eslint-env jest */
|
||||
const createFilesLib = require("../../../../../../utils/agents/aibitat/plugins/create-files/lib.js");
|
||||
|
||||
/**
 * Jest suite for `stripInvalidXmlChars` as exposed by the create-files lib.
 *
 * Three groups of expectations are covered:
 *  - the C0 control characters XML 1.0 forbids are removed,
 *  - ordinary document content (markdown, HTML, unicode, code, LaTeX-style
 *    backslash text) comes back byte-for-byte unchanged,
 *  - arrays/objects are cleaned recursively and non-string scalars are
 *    returned as-is.
 */
describe("CreateFilesManager.stripInvalidXmlChars", () => {
  // Shorthand that always invokes the helper as a method of the lib export.
  const strip = (input) => createFilesLib.stripInvalidXmlChars(input);

  // Asserts that the helper hands `text` back exactly as it was given.
  const expectUntouched = (text) => {
    expect(strip(text)).toBe(text);
  };

  test("removes the form feed produced by a LaTeX backslash sequence", () => {
    // A JSON "\f" escape (the start of `\frac`) decodes to U+000C, a form
    // feed. XML 1.0 forbids it, so it would corrupt an OOXML document.
    const withFormFeed = "En la fracción $\x0Crac{3}{5}$";
    const result = strip(withFormFeed);

    expect(result).toBe("En la fracción $rac{3}{5}$");
    expect(result).not.toMatch(/[\x00-\x08\x0B\x0C\x0E-\x1F]/);
  });

  test("strips every disallowed C0 control character", () => {
    const withControls = "a\x00b\x08c\x0Bd\x0Ce\x1Ff";
    const result = strip(withControls);

    expect(result).toBe("abcdef");
  });

  test("preserves tab, line feed, and carriage return (the legal C0 chars)", () => {
    expectUntouched("line1\tcol2\nline2\r\nline3");
  });

  test("leaves clean strings unchanged", () => {
    expectUntouched("# Title\n\nA normal paragraph with **bold** text.");
  });

  test("preserves typical markdown document content", () => {
    const markdownLines = [
      "# Quarterly Report\n",
      "## Summary\n",
      "Revenue grew **15%** year-over-year.\n",
      "- Item 1: $1,200\n- Item 2: $3,400\n",
      "| Column A | Column B |\n|----------|----------|\n| value | value |",
    ];

    expectUntouched(markdownLines.join("\n"));
  });

  test("preserves unicode, accented characters, and emoji", () => {
    expectUntouched("Ñoño résumé naïve — «quotes» 日本語 🎉👍");
  });

  test("preserves HTML tags that appear in rich content", () => {
    expectUntouched(
      '<h1>Title</h1>\n<p style="color:red">Hello & goodbye</p>'
    );
  });

  test("preserves code blocks and special syntax", () => {
    expectUntouched(
      "```javascript\nconst x = () => { return 42; };\n```\n\n$E = mc^2$"
    );
  });

  test("preserves backslash sequences that are NOT control characters", () => {
    // These are literal backslashes followed by letters, not escapes.
    expectUntouched("Use \\textbf{bold} and \\newline and C:\\Users\\file.txt");
  });

  test("recursively cleans arrays and nested objects", () => {
    const dirtySheets = [
      {
        name: "Sheet\x0C1",
        csvData: "a,b\n1\x00,2",
        options: { headerStyle: true, autoFit: 1 },
      },
    ];
    const expectedSheets = [
      {
        name: "Sheet1",
        csvData: "a,b\n1,2",
        options: { headerStyle: true, autoFit: 1 },
      },
    ];

    expect(strip(dirtySheets)).toEqual(expectedSheets);
  });

  test("returns non-string scalars untouched", () => {
    expect(strip(null)).toBeNull();
    expect(strip(undefined)).toBeUndefined();
    expect(strip(42)).toBe(42);
    expect(strip(true)).toBe(true);
  });
});
|
||||
@@ -122,6 +122,13 @@ module.exports.CreateDocxFile = {
|
||||
try {
|
||||
this.super.handlerProps.log(`Using the create-docx-file tool.`);
|
||||
|
||||
// Strip XML 1.0 illegal control characters (e.g. the form feed a
|
||||
// LaTeX `\frac` decodes to) so Word can open the generated file.
|
||||
content = createFilesLib.stripInvalidXmlChars(content);
|
||||
title = createFilesLib.stripInvalidXmlChars(title);
|
||||
subtitle = createFilesLib.stripInvalidXmlChars(subtitle);
|
||||
author = createFilesLib.stripInvalidXmlChars(author);
|
||||
|
||||
const hasExtension = /\.docx$/i.test(filename);
|
||||
if (!hasExtension) filename = `${filename}.docx`;
|
||||
const displayFilename = filename.split("/").pop();
|
||||
|
||||
@@ -268,6 +268,39 @@ class CreateFilesManager {
|
||||
.substring(0, 255);
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes characters that are illegal in XML 1.0 from a string, or - when
|
||||
* given an array/object - recursively from every string it contains.
|
||||
*
|
||||
* OOXML documents (.docx/.xlsx/.pptx) embed their text directly into internal
|
||||
* XML parts (e.g. word/document.xml). XML 1.0 §2.2 forbids every C0 control
|
||||
* character except tab (U+0009), line feed (U+000A) and carriage return
|
||||
* (U+000D). When one of the forbidden characters reaches the content the file
|
||||
* is still a valid ZIP, but Office refuses to open it ("Word experienced an
|
||||
* error trying to open the file."). The most common offender is a form feed
|
||||
* (U+000C): an LLM that emits LaTeX such as `\frac` produces a `\f` JSON
|
||||
* escape that decodes to U+000C before it ever reaches the generator.
|
||||
*
|
||||
* Stripping these characters yields a readable document instead of a corrupt
|
||||
* one. Non-string scalars are returned untouched.
|
||||
* @param {*} value - A string, or an array/object that may contain strings.
|
||||
* @returns {*} The value with all invalid XML characters removed.
|
||||
*/
|
||||
stripInvalidXmlChars(value) {
|
||||
if (typeof value === "string")
|
||||
// eslint-disable-next-line no-control-regex
|
||||
return value.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g, "");
|
||||
if (Array.isArray(value))
|
||||
return value.map((item) => this.stripInvalidXmlChars(item));
|
||||
if (value && typeof value === "object") {
|
||||
const cleaned = {};
|
||||
for (const [key, val] of Object.entries(value))
|
||||
cleaned[key] = this.stripInvalidXmlChars(val);
|
||||
return cleaned;
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the AnythingLLM logo for branding.
|
||||
* @param {Object} options
|
||||
|
||||
@@ -169,6 +169,11 @@ module.exports.CreatePptxPresentation = {
|
||||
`Using the create-pptx-presentation tool.`
|
||||
);
|
||||
|
||||
// Strip XML 1.0 illegal control characters so PowerPoint can open
|
||||
// the generated deck (slide content is sanitized after assembly).
|
||||
title = createFilesLib.stripInvalidXmlChars(title);
|
||||
author = createFilesLib.stripInvalidXmlChars(author);
|
||||
|
||||
if (!filename.toLowerCase().endsWith(".pptx"))
|
||||
filename += ".pptx";
|
||||
|
||||
@@ -250,12 +255,18 @@ module.exports.CreatePptxPresentation = {
|
||||
|
||||
const totalSlideCount = allSlides.length;
|
||||
|
||||
// Sub-agent output can carry XML 1.0 illegal control characters
|
||||
// (e.g. a form feed from a LaTeX `\frac`); strip them recursively
|
||||
// from every slide so PowerPoint can open the generated deck.
|
||||
const cleanSlides =
|
||||
createFilesLib.stripInvalidXmlChars(allSlides);
|
||||
|
||||
// Title slide
|
||||
const titleSlide = pptx.addSlide();
|
||||
renderTitleSlide(titleSlide, pptx, { title, author }, theme);
|
||||
|
||||
// Render every slide produced by the section agents
|
||||
allSlides.forEach((slideData, index) => {
|
||||
cleanSlides.forEach((slideData, index) => {
|
||||
const slide = pptx.addSlide();
|
||||
const slideNumber = index + 1;
|
||||
const layout = slideData.layout || "content";
|
||||
|
||||
@@ -167,6 +167,11 @@ module.exports.CreateExcelFile = {
|
||||
try {
|
||||
this.super.handlerProps.log(`Using the create-excel-file tool.`);
|
||||
|
||||
// Strip XML 1.0 illegal control characters from all cell content
|
||||
// and sheet names so Excel can open the generated workbook.
|
||||
csvData = createFilesLib.stripInvalidXmlChars(csvData);
|
||||
sheets = createFilesLib.stripInvalidXmlChars(sheets);
|
||||
|
||||
const hasExtension = /\.xlsx$/i.test(filename);
|
||||
if (!hasExtension) filename = `${filename}.xlsx`;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user