mirror of
https://github.com/LearningCircuit/local-deep-research.git
synced 2026-06-15 19:46:56 +03:00
* fix(ci): make LDR research workflow honestly fail on Python crash A real run (job 77511717371, PR #4225) crashed with glibc 'double free or corruption (!prev)' but the workflow reported success and the caller posted a hollow PR comment. Two cooperating defects: the script's exit code was discarded inside set +e / set -e, and `jq .` exits 0 on a zero-byte response.json so the JSON-shape check passed on empty input. Capture the exit code, harden the validation order (exit -> non-empty -> jq -e shape -> .error -> .research non-empty), tee stderr to a log surfaced in the ::error:: annotation, upload the artifact with if: always() so failed runs leave debuggable evidence, and flush stdout in a finally block in the script so a SIGABRT during interpreter shutdown after json.dumps can't drop the otherwise-completed output. Matches the house pattern from dockle.yml and the jq -e idiom from release-gate.yml. * chore(ci): enable faulthandler in ldr-research.py Dumps a Python traceback to stderr on SIGABRT/SIGSEGV/SIGFPE/SIGBUS/SIGILL before the signal re-fires. Pairs with the stderr-capture plumbing earlier in this PR: on the next glibc abort the ::error:: annotation will show "Fatal Python error: Aborted" plus the actual Python stack frame, making the deps-level investigation possible without a re-run. Verified locally: a deliberately aborted child process emits its frame through faulthandler before exiting on signal 6.
201 lines
6.4 KiB
Python
Executable File
201 lines
6.4 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
LDR Research Script
|
|
|
|
Reads a research query from stdin and uses Local Deep Research to find
|
|
relevant documentation, sources, and context. Returns JSON with the
|
|
research output.
|
|
|
|
Usage:
|
|
# Pipe a query from stdin
|
|
echo "What is RAG?" | python scripts/ldr-research.py
|
|
|
|
# Or pass a file
|
|
python scripts/ldr-research.py < query.txt
|
|
|
|
# With CLI arguments (easier local testing)
|
|
python scripts/ldr-research.py --provider openrouter --model gpt-4o < query.txt
|
|
|
|
Output: JSON with research results, sources, and findings.
|
|
|
|
Environment variables (can be overridden by CLI args):
|
|
OPENROUTER_API_KEY - API key for OpenRouter
|
|
SERPER_API_KEY - API key for Serper.dev search
|
|
LDR_PROVIDER - LLM provider (default: openrouter)
|
|
LDR_SEARCH_TOOL - Search tool (default: serper)
|
|
LDR_RESEARCH_MODEL - Model name (default: google/gemini-2.0-flash-001 for openrouter)
|
|
LDR_STRATEGY - Search strategy (default: langgraph-agent)
|
|
|
|
Note: This uses the programmatic API and does NOT require a running LDR server.
|
|
"""
|
|
|
|
import argparse
|
|
import faulthandler
|
|
import json
|
|
import os
|
|
import sys
|
|
|
|
# Dump a Python traceback to stderr on SIGABRT/SIGSEGV/SIGFPE/SIGBUS/SIGILL.
|
|
faulthandler.enable()
|
|
|
|
|
|
def make_serializable(obj):
|
|
"""Convert objects to JSON-serializable format."""
|
|
if obj is None:
|
|
return None
|
|
if isinstance(obj, (str, int, float, bool)):
|
|
return obj
|
|
if isinstance(obj, dict):
|
|
return {k: make_serializable(v) for k, v in obj.items()}
|
|
if isinstance(obj, (list, tuple)):
|
|
return [make_serializable(item) for item in obj]
|
|
# Handle LangChain Document objects
|
|
if hasattr(obj, "page_content") and hasattr(obj, "metadata"):
|
|
return {
|
|
"content": obj.page_content,
|
|
"metadata": make_serializable(obj.metadata),
|
|
}
|
|
# Handle other objects with __dict__
|
|
if hasattr(obj, "__dict__"):
|
|
return make_serializable(obj.__dict__)
|
|
# Fallback to string representation
|
|
return str(obj)
|
|
|
|
|
|
def parse_args():
|
|
parser = argparse.ArgumentParser(
|
|
description="Run LDR research on a query read from stdin"
|
|
)
|
|
parser.add_argument(
|
|
"--provider",
|
|
default=os.environ.get("LDR_PROVIDER", "openrouter"),
|
|
help="LLM provider (default: openrouter)",
|
|
)
|
|
parser.add_argument(
|
|
"--search-tool",
|
|
default=os.environ.get("LDR_SEARCH_TOOL", "serper"),
|
|
help="Search tool (default: serper)",
|
|
)
|
|
parser.add_argument(
|
|
"--model",
|
|
default=os.environ.get("LDR_RESEARCH_MODEL"),
|
|
help="Model name (default: provider's default)",
|
|
)
|
|
parser.add_argument(
|
|
"--iterations",
|
|
type=int,
|
|
default=None,
|
|
help=(
|
|
"Number of research iterations. If unset, the strategy uses "
|
|
"its own default (e.g. langgraph-agent reads "
|
|
"langgraph_agent.max_iterations from settings)."
|
|
),
|
|
)
|
|
parser.add_argument(
|
|
"--strategy",
|
|
default=os.environ.get("LDR_STRATEGY", "langgraph-agent"),
|
|
help="Search strategy name (default: langgraph-agent)",
|
|
)
|
|
return parser.parse_args()
|
|
|
|
|
|
def main():
|
|
# Flush in finally: SIGABRT during interpreter shutdown won't drain the stdout buffer.
|
|
try:
|
|
args = parse_args()
|
|
|
|
# Read query from stdin
|
|
query = sys.stdin.read().strip()
|
|
if not query:
|
|
print(json.dumps({"error": "No query provided on stdin"}))
|
|
sys.exit(1)
|
|
|
|
# Default model for OpenRouter if not specified
|
|
model_name = args.model
|
|
if not model_name and args.provider == "openrouter":
|
|
model_name = "google/gemini-2.0-flash-001"
|
|
|
|
# Check required API keys
|
|
if args.provider == "openrouter" and not os.environ.get(
|
|
"OPENROUTER_API_KEY"
|
|
):
|
|
print(json.dumps({"error": "OPENROUTER_API_KEY not set"}))
|
|
sys.exit(1)
|
|
|
|
if args.search_tool == "serper" and not os.environ.get(
|
|
"SERPER_API_KEY"
|
|
):
|
|
print(json.dumps({"error": "SERPER_API_KEY not set"}))
|
|
sys.exit(1)
|
|
|
|
try:
|
|
from local_deep_research.api import quick_summary
|
|
from local_deep_research.api.settings_utils import (
|
|
create_settings_snapshot,
|
|
)
|
|
|
|
# Build settings overrides
|
|
overrides = {
|
|
"search.tool": args.search_tool,
|
|
"llm.provider": args.provider,
|
|
}
|
|
if model_name:
|
|
overrides["llm.model"] = model_name
|
|
|
|
# Add API keys from environment
|
|
if os.environ.get("OPENROUTER_API_KEY"):
|
|
overrides["llm.openrouter.api_key"] = os.environ[
|
|
"OPENROUTER_API_KEY"
|
|
]
|
|
if os.environ.get("SERPER_API_KEY"):
|
|
overrides["search.engine.web.serper.api_key"] = os.environ[
|
|
"SERPER_API_KEY"
|
|
]
|
|
|
|
settings = create_settings_snapshot(overrides=overrides)
|
|
|
|
# Build kwargs
|
|
kwargs = {
|
|
"query": query,
|
|
"provider": args.provider,
|
|
"search_tool": args.search_tool,
|
|
"settings_snapshot": settings,
|
|
"programmatic_mode": True,
|
|
"search_strategy": args.strategy,
|
|
}
|
|
if model_name:
|
|
kwargs["model_name"] = model_name
|
|
if args.iterations is not None:
|
|
kwargs["iterations"] = args.iterations
|
|
|
|
result = quick_summary(**kwargs)
|
|
|
|
# Use formatted_findings if available (already properly formatted with sources)
|
|
# Fall back to summary if not
|
|
research_output = result.get("formatted_findings") or result.get(
|
|
"summary", str(result)
|
|
)
|
|
|
|
# Build output - make sure everything is JSON serializable
|
|
output = {
|
|
"research": research_output,
|
|
"sources": make_serializable(result.get("sources", [])),
|
|
"findings": make_serializable(result.get("findings", [])),
|
|
"iterations": result.get("iterations"),
|
|
}
|
|
|
|
print(json.dumps(output))
|
|
|
|
except Exception as e:
|
|
print(json.dumps({"error": str(e)}))
|
|
sys.exit(1)
|
|
finally:
|
|
try:
|
|
sys.stdout.flush()
|
|
except Exception: # noqa: silent-exception
|
|
pass
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|