fix: strip XML-illegal control characters from generated documents (#5760)

* fix: strip XML-illegal control characters from generated documents

OOXML documents (.docx/.xlsx/.pptx) embed their text directly into
internal XML parts (e.g. word/document.xml). XML 1.0 §2.2 forbids every
C0 control character except tab, line feed, and carriage return. When an
LLM emits LaTeX such as `\frac`, the `\f` decodes (via the tool-call JSON
escape) to a form feed (U+000C) which lands in the content unchanged. The
generated file is a valid ZIP but Office refuses to open it ("Word
experienced an error trying to open the file.").

Add a shared `stripInvalidXmlChars` helper to the create-files manager
that removes the illegal characters from a string or, recursively, from
every string in a nested array/object. Apply it to the user/LLM-supplied
content in the docx, xlsx, and pptx handlers before generation.
Plain-text output is left untouched since control characters are legal
there.

Resolves #5756

* unset binary format

* update test suite to catch drifts

* lint

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
This commit is contained in:
Sanidhya Singh
2026-06-05 19:44:35 +05:30
committed by GitHub
parent 54b530942b
commit cba598edb5
5 changed files with 143 additions and 1 deletions

View File

@@ -0,0 +1,86 @@
/* eslint-env jest */
const createFilesLib = require("../../../../../../utils/agents/aibitat/plugins/create-files/lib.js");
describe("CreateFilesManager.stripInvalidXmlChars", () => {
  // Always invoke the helper as a method of the manager; every case below
  // goes through this one shorthand.
  const strip = (value) => createFilesLib.stripInvalidXmlChars(value);

  test("removes the form feed produced by a LaTeX backslash sequence", () => {
    // An LLM's `\frac` reaches the tool as a JSON "\f" escape, i.e. U+000C
    // (form feed). XML 1.0 forbids it and it corrupts OOXML documents.
    const result = strip("En la fracción $\x0Crac{3}{5}$");
    expect(result).toBe("En la fracción $rac{3}{5}$");
    expect(result).not.toMatch(/[\x00-\x08\x0B\x0C\x0E-\x1F]/);
  });

  test("strips every disallowed C0 control character", () => {
    const withControls = "a\x00b\x08c\x0Bd\x0Ce\x1Ff";
    const result = strip(withControls);
    expect(result).toBe("abcdef");
  });

  // Inputs that contain nothing illegal and must come back exactly as given.
  // Each row is [test title, content].
  const untouchedCases = [
    [
      "preserves tab, line feed, and carriage return (the legal C0 chars)",
      "line1\tcol2\nline2\r\nline3",
    ],
    [
      "leaves clean strings unchanged",
      "# Title\n\nA normal paragraph with **bold** text.",
    ],
    [
      "preserves typical markdown document content",
      [
        "# Quarterly Report\n",
        "## Summary\n",
        "Revenue grew **15%** year-over-year.\n",
        "- Item 1: $1,200\n- Item 2: $3,400\n",
        "| Column A | Column B |\n|----------|----------|\n| value | value |",
      ].join("\n"),
    ],
    [
      "preserves unicode, accented characters, and emoji",
      "Ñoño résumé naïve — «quotes» 日本語 🎉👍",
    ],
    [
      "preserves HTML tags that appear in rich content",
      '<h1>Title</h1>\n<p style="color:red">Hello &amp; goodbye</p>',
    ],
    [
      "preserves code blocks and special syntax",
      "```javascript\nconst x = () => { return 42; };\n```\n\n$E = mc^2$",
    ],
    [
      "preserves backslash sequences that are NOT control characters",
      "Use \\textbf{bold} and \\newline and C:\\Users\\file.txt",
    ],
  ];

  test.each(untouchedCases)("%s", (_title, content) => {
    expect(strip(content)).toBe(content);
  });

  test("recursively cleans arrays and nested objects", () => {
    const dirtySheets = [
      {
        name: "Sheet\x0C1",
        csvData: "a,b\n1\x00,2",
        options: { headerStyle: true, autoFit: 1 },
      },
    ];
    const expectedSheets = [
      {
        name: "Sheet1",
        csvData: "a,b\n1,2",
        options: { headerStyle: true, autoFit: 1 },
      },
    ];
    expect(strip(dirtySheets)).toEqual(expectedSheets);
  });

  test("returns non-string scalars untouched", () => {
    expect(strip(null)).toBeNull();
    expect(strip(undefined)).toBeUndefined();
    expect(strip(42)).toBe(42);
    expect(strip(true)).toBe(true);
  });
});

View File

@@ -122,6 +122,13 @@ module.exports.CreateDocxFile = {
try {
this.super.handlerProps.log(`Using the create-docx-file tool.`);
// Strip XML 1.0 illegal control characters (e.g. the form feed a
// LaTeX `\frac` decodes to) so Word can open the generated file.
content = createFilesLib.stripInvalidXmlChars(content);
title = createFilesLib.stripInvalidXmlChars(title);
subtitle = createFilesLib.stripInvalidXmlChars(subtitle);
author = createFilesLib.stripInvalidXmlChars(author);
const hasExtension = /\.docx$/i.test(filename);
if (!hasExtension) filename = `${filename}.docx`;
const displayFilename = filename.split("/").pop();

View File

@@ -268,6 +268,39 @@ class CreateFilesManager {
.substring(0, 255);
}
/**
* Removes characters that are illegal in XML 1.0 from a string, or - when
* given an array/object - recursively from every string it contains.
*
* OOXML documents (.docx/.xlsx/.pptx) embed their text directly into internal
* XML parts (e.g. word/document.xml). XML 1.0 §2.2 forbids every C0 control
* character except tab (U+0009), line feed (U+000A) and carriage return
* (U+000D). When one of the forbidden characters reaches the content the file
* is still a valid ZIP, but Office refuses to open it ("Word experienced an
* error trying to open the file."). The most common offender is a form feed
* (U+000C): an LLM that emits LaTeX such as `\frac` produces a `\f` JSON
* escape that decodes to U+000C before it ever reaches the generator.
*
* Stripping these characters yields a readable document instead of a corrupt
* one. Non-string scalars are returned untouched.
* @param {*} value - A string, or an array/object that may contain strings.
* @returns {*} The value with all invalid XML characters removed.
*/
stripInvalidXmlChars(value) {
if (typeof value === "string")
// eslint-disable-next-line no-control-regex
return value.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g, "");
if (Array.isArray(value))
return value.map((item) => this.stripInvalidXmlChars(item));
if (value && typeof value === "object") {
const cleaned = {};
for (const [key, val] of Object.entries(value))
cleaned[key] = this.stripInvalidXmlChars(val);
return cleaned;
}
return value;
}
/**
* Gets the AnythingLLM logo for branding.
* @param {Object} options

View File

@@ -169,6 +169,11 @@ module.exports.CreatePptxPresentation = {
`Using the create-pptx-presentation tool.`
);
// Strip XML 1.0 illegal control characters so PowerPoint can open
// the generated deck (slide content is sanitized after assembly).
title = createFilesLib.stripInvalidXmlChars(title);
author = createFilesLib.stripInvalidXmlChars(author);
if (!filename.toLowerCase().endsWith(".pptx"))
filename += ".pptx";
@@ -250,12 +255,18 @@ module.exports.CreatePptxPresentation = {
const totalSlideCount = allSlides.length;
// Sub-agent output can carry XML 1.0 illegal control characters
// (e.g. a form feed from a LaTeX `\frac`); strip them recursively
// from every slide so PowerPoint can open the generated deck.
const cleanSlides =
createFilesLib.stripInvalidXmlChars(allSlides);
// Title slide
const titleSlide = pptx.addSlide();
renderTitleSlide(titleSlide, pptx, { title, author }, theme);
// Render every slide produced by the section agents
allSlides.forEach((slideData, index) => {
cleanSlides.forEach((slideData, index) => {
const slide = pptx.addSlide();
const slideNumber = index + 1;
const layout = slideData.layout || "content";

View File

@@ -167,6 +167,11 @@ module.exports.CreateExcelFile = {
try {
this.super.handlerProps.log(`Using the create-excel-file tool.`);
// Strip XML 1.0 illegal control characters from all cell content
// and sheet names so Excel can open the generated workbook.
csvData = createFilesLib.stripInvalidXmlChars(csvData);
sheets = createFilesLib.stripInvalidXmlChars(sheets);
const hasExtension = /\.xlsx$/i.test(filename);
if (!hasExtension) filename = `${filename}.xlsx`;