Files
local-deep-research/scripts/ldr-research.py
LearningCircuit bc45bbf2e7 fix(ci): make LDR research workflow honestly fail on Python crash (#4226)
* fix(ci): make LDR research workflow honestly fail on Python crash

A real run (job 77511717371, PR #4225) crashed with glibc 'double free or
corruption (!prev)' but the workflow reported success and the caller posted
a hollow PR comment. Two cooperating defects: the script's exit code was
discarded inside set +e / set -e, and `jq .` exits 0 on a zero-byte
response.json so the JSON-shape check passed on empty input.

Capture the exit code, harden the validation order (exit -> non-empty ->
jq -e shape -> .error -> .research non-empty), tee stderr to a log surfaced
in the ::error:: annotation, upload the artifact with if: always() so failed
runs leave debuggable evidence, and flush stdout in a finally block in the
script so a SIGABRT during interpreter shutdown after json.dumps can't drop
the otherwise-completed output. Matches the house pattern from dockle.yml
and the jq -e idiom from release-gate.yml.

* chore(ci): enable faulthandler in ldr-research.py

Dumps a Python traceback to stderr on SIGABRT/SIGSEGV/SIGFPE/SIGBUS/SIGILL
before the signal re-fires. Pairs with the stderr-capture plumbing earlier
in this PR: on the next glibc abort the ::error:: annotation will show
"Fatal Python error: Aborted" plus the actual Python stack frame, making
the deps-level investigation possible without a re-run.

Verified locally: a deliberately aborted child process emits its frame
through faulthandler before exiting on signal 6.
2026-05-24 09:45:14 +02:00

201 lines
6.4 KiB
Python
Executable File

#!/usr/bin/env python3
"""
LDR Research Script
Reads a research query from stdin and uses Local Deep Research to find
relevant documentation, sources, and context. Returns JSON with the
research output.
Usage:
# Pipe a query from stdin
echo "What is RAG?" | python scripts/ldr-research.py
# Or pass a file
python scripts/ldr-research.py < query.txt
# With CLI arguments (easier local testing)
python scripts/ldr-research.py --provider openrouter --model gpt-4o < query.txt
Output: JSON with research results, sources, and findings.
Environment variables (can be overridden by CLI args):
OPENROUTER_API_KEY - API key for OpenRouter
SERPER_API_KEY - API key for Serper.dev search
LDR_PROVIDER - LLM provider (default: openrouter)
LDR_SEARCH_TOOL - Search tool (default: serper)
LDR_RESEARCH_MODEL - Model name (default: google/gemini-2.0-flash-001 for openrouter)
LDR_STRATEGY - Search strategy (default: langgraph-agent)
Note: This uses the programmatic API and does NOT require a running LDR server.
"""
import argparse
import faulthandler
import json
import os
import sys
# Dump a Python traceback to stderr on SIGABRT/SIGSEGV/SIGFPE/SIGBUS/SIGILL.
faulthandler.enable()
def make_serializable(obj):
"""Convert objects to JSON-serializable format."""
if obj is None:
return None
if isinstance(obj, (str, int, float, bool)):
return obj
if isinstance(obj, dict):
return {k: make_serializable(v) for k, v in obj.items()}
if isinstance(obj, (list, tuple)):
return [make_serializable(item) for item in obj]
# Handle LangChain Document objects
if hasattr(obj, "page_content") and hasattr(obj, "metadata"):
return {
"content": obj.page_content,
"metadata": make_serializable(obj.metadata),
}
# Handle other objects with __dict__
if hasattr(obj, "__dict__"):
return make_serializable(obj.__dict__)
# Fallback to string representation
return str(obj)
def parse_args():
parser = argparse.ArgumentParser(
description="Run LDR research on a query read from stdin"
)
parser.add_argument(
"--provider",
default=os.environ.get("LDR_PROVIDER", "openrouter"),
help="LLM provider (default: openrouter)",
)
parser.add_argument(
"--search-tool",
default=os.environ.get("LDR_SEARCH_TOOL", "serper"),
help="Search tool (default: serper)",
)
parser.add_argument(
"--model",
default=os.environ.get("LDR_RESEARCH_MODEL"),
help="Model name (default: provider's default)",
)
parser.add_argument(
"--iterations",
type=int,
default=None,
help=(
"Number of research iterations. If unset, the strategy uses "
"its own default (e.g. langgraph-agent reads "
"langgraph_agent.max_iterations from settings)."
),
)
parser.add_argument(
"--strategy",
default=os.environ.get("LDR_STRATEGY", "langgraph-agent"),
help="Search strategy name (default: langgraph-agent)",
)
return parser.parse_args()
def main():
# Flush in finally: SIGABRT during interpreter shutdown won't drain the stdout buffer.
try:
args = parse_args()
# Read query from stdin
query = sys.stdin.read().strip()
if not query:
print(json.dumps({"error": "No query provided on stdin"}))
sys.exit(1)
# Default model for OpenRouter if not specified
model_name = args.model
if not model_name and args.provider == "openrouter":
model_name = "google/gemini-2.0-flash-001"
# Check required API keys
if args.provider == "openrouter" and not os.environ.get(
"OPENROUTER_API_KEY"
):
print(json.dumps({"error": "OPENROUTER_API_KEY not set"}))
sys.exit(1)
if args.search_tool == "serper" and not os.environ.get(
"SERPER_API_KEY"
):
print(json.dumps({"error": "SERPER_API_KEY not set"}))
sys.exit(1)
try:
from local_deep_research.api import quick_summary
from local_deep_research.api.settings_utils import (
create_settings_snapshot,
)
# Build settings overrides
overrides = {
"search.tool": args.search_tool,
"llm.provider": args.provider,
}
if model_name:
overrides["llm.model"] = model_name
# Add API keys from environment
if os.environ.get("OPENROUTER_API_KEY"):
overrides["llm.openrouter.api_key"] = os.environ[
"OPENROUTER_API_KEY"
]
if os.environ.get("SERPER_API_KEY"):
overrides["search.engine.web.serper.api_key"] = os.environ[
"SERPER_API_KEY"
]
settings = create_settings_snapshot(overrides=overrides)
# Build kwargs
kwargs = {
"query": query,
"provider": args.provider,
"search_tool": args.search_tool,
"settings_snapshot": settings,
"programmatic_mode": True,
"search_strategy": args.strategy,
}
if model_name:
kwargs["model_name"] = model_name
if args.iterations is not None:
kwargs["iterations"] = args.iterations
result = quick_summary(**kwargs)
# Use formatted_findings if available (already properly formatted with sources)
# Fall back to summary if not
research_output = result.get("formatted_findings") or result.get(
"summary", str(result)
)
# Build output - make sure everything is JSON serializable
output = {
"research": research_output,
"sources": make_serializable(result.get("sources", [])),
"findings": make_serializable(result.get("findings", [])),
"iterations": result.get("iterations"),
}
print(json.dumps(output))
except Exception as e:
print(json.dumps({"error": str(e)}))
sys.exit(1)
finally:
try:
sys.stdout.flush()
except Exception: # noqa: silent-exception
pass
if __name__ == "__main__":
main()