mirror of
https://github.com/LearningCircuit/local-deep-research.git
synced 2026-06-15 19:46:56 +03:00
* chore(lint): add ruff rules for logging, performance, exceptions, and print detection Add wave 2 lint rules: G, PERF, RET, TRY, T20, C4, ERA. All existing violations are suppressed via ignore/per-file-ignores so this config change is merge-safe. Follow-up PRs will fix violations and remove the ignore entries incrementally. * fix(lint): exempt pre-commit hooks from T201 print rule (#3270) Pre-commit hooks are CLI scripts where print is the intended output interface, same as scripts/ and cli/ directories already exempted. * fix(lint): fix all low-count ruff violations instead of suppressing them (#3275) * fix(lint): replace manual dict-building loops with dict comprehensions (PERF403) * fix(lint): replace bare Exception raises with specific built-in types (TRY002) Replace all `raise Exception(...)` in production code with appropriate built-in exception types: RuntimeError for operational/state failures, ValueError for invalid data, and ConnectionError for HTTP errors. * fix(lint): resolve TRY004 and PERF402 ruff violations Use TypeError instead of ValueError for isinstance/issubclass type checks (TRY004), and replace manual for-loop list copies with list.extend() (PERF402). 
* fix(lint): fix all low-count ruff violations instead of suppressing them Fix all violations for 15 ruff rules that had ≤10 occurrences each, rather than suppressing them with ignore directives: - TRY002: raise-vanilla-class → use specific built-in exceptions - TRY004: type-check-without-type-error → use TypeError - C408: unnecessary-collection-call → use dict/list literals - C401: unnecessary-generator-set → use set comprehensions - C416: unnecessary-comprehension → use list()/set() - C414: unnecessary-double-cast-or-process → simplify - PERF403: manual-dict-comprehension → use dict comprehensions - PERF102: incorrect-dict-iterator → use .values()/.keys() - PERF402: manual-list-copy → use list.extend() - RET503/RET506/RET507/RET508: superfluous else after return/raise/continue/break - RET501/RET502: unnecessary/implicit return None Adds per-file-ignores for tests/ and examples/ where these patterns are acceptable (e.g. bare Exception in tests, dict() calls in fixtures). * fix(lint): enforce E722, ERA001, RET505 and fix pre-commit RET503 gap (#3276) Remove three rules from the global ignore list by fixing all violations: E722 (bare except) — 6 violations in tests: Replace `except:` with `except Exception:` to avoid swallowing KeyboardInterrupt and SystemExit. ERA001 (commented-out code) — 25 violations: Delete 18 true positives (dead variables, disabled debug logs, commented-out imports). Add `# noqa: ERA001` to 7 false positives (template instructions, type annotations, documentation comments). RET505 (superfluous else after return) — 413 violations: Auto-fix all occurrences. Also fixes 5 cascading RET506/RET507 violations exposed by the RET505 removals. Pre-commit hooks gap: Add RET503 to `.pre-commit-hooks/**` per-file-ignores alongside T201. 
* fix(lint): enforce RET504 and TRY301 — fix all violations (#3279) * fix(lint): enforce RET504 — collapse unnecessary assign-before-return Auto-fix all 46 RET504 violations via ruff unsafe-fixes: collapse `result = expr; return result` into `return expr`. Remove RET504 from global ignore list. Add to tests/examples per-file-ignores where intermediate variables aid test clarity. Also removes TRY301 from global ignore (violations fixed in next commit). * fix(lint): enforce TRY301 — fix raises inside broad try/except blocks Structural fixes for 65 TRY301 violations: Security-critical fixes: - url_validator.py: move 6 validation raises before try block, replace isinstance-based re-raise with specific except clause - path_validator.py: move validation outside try block - env_settings.py: separate parsing (try) from validation (outside) Route/service fixes: - research_routes.py: replace raise-then-catch with direct error return - mcp/server.py: move all 7 tool validations before try blocks - news/api.py: move validation before try, noqa for db-session raises - notifications: move rate limit and URL validation before try blocks - iterative_refinement_strategy.py: move JSON validation after try Added noqa for intentional patterns: re-raise in except handlers, nested function definitions, db-session-dependent checks, rate limit re-raises for base class retry logic. 
* merge: resolve conflicts between wave2 lint branch and main Resolve 14 merge conflicts by always starting from main's version and re-applying lint fixes on top: - mcp_strategy.py, ollama.py, security_settings.py, delete_routes.py: Take main's code, re-apply RET505 (remove else: after return) - mcp/server.py (3 conflicts): Take main's ValidationError handlers and set_settings_context, re-apply TRY301 fixes, fix sensitive data logging - research_routes.py: Take main, fix duplicate block (merge artifact) - settings_routes.py: Take main's default-settings fallback feature - meta_search_engine.py, parallel_search_engine.py: Take main's get_available_engines delegation, delete unreachable code - search_engine_ddg.py, search_engine_google_pse.py: Take main's sanitization, re-apply RET506 (if not elif after raise) - rag_routes.py: Accept main's deletion (route moved to delete_routes) - encryption_check.py: Accept main's deletion (dead code) - test_storage_coverage.py: Remove broken test classes referencing undefined stubs - pre-commit hooks: extend per-file-ignores for ERA001, RET504 * fix: revert ValueError→TypeError changes that break tests and API contracts Revert TRY004 fixes in 3 files where changing ValueError to TypeError would break existing tests and HTTP status code contracts: - card_factory.py: 5 tests assert pytest.raises(ValueError) - base_rater.py: flask_api.py catches ValueError for HTTP 400 responses; TypeError would fall through to HTTP 500 - full_search.py: test asserts pytest.raises(ValueError) Add # noqa: TRY004 to suppress the lint rule on these lines. * fix: move benchmark_data check back inside try block The ValueError for missing benchmark_data must be inside the try/except so the except handler can mark the run as FAILED in the database. Without this, the exception propagates unhandled in a daemon thread, leaving the benchmark run stuck in RUNNING state permanently. 
* chore(lint): remove ERA rule and suppress TRY004 globally Remove ERA (eradicate — commented-out code detection) from ruff select: - 28% false positive rate in our codebase (7 of 25 violations) - No major Python project enables it (Django, FastAPI, Pydantic, Airflow) - Ruff itself doesn't use it; autofix was demoted to manual-only - 172 noqa suppressions provided zero enforcement value Suppress TRY004 (type-check-without-type-error) globally: - Ruff maintainer agreed the autofix "can change functionality" - We already had to revert 3 TypeError changes that broke tests and HTTP 400→500 API contracts - Django, Flask, pandas all use isinstance + ValueError routinely - Pylint has no equivalent rule; near-zero PyPI adoption Remove all 173 # noqa: ERA001 and 49 # noqa: TRY004 comments from the codebase — no longer needed with rules disabled/suppressed. * fix: resolve mypy errors, failing MCP test, and TRY301 noqa - search_engine_factory.py: restore typed intermediate variable to fix mypy no-any-return (RET504 collapse lost the type annotation) - search_engine_pubchem.py: add explicit list[str] type annotation - test_edge_cases.py: fix assertion that expected engine name in sanitized error message - mcp/server.py: add noqa: TRY301 to validation raises inside try blocks (from main's new merge code)
380 lines
12 KiB
Python
Executable File
380 lines
12 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Generate CONFIGURATION.md from default settings JSON files and env_definitions.
|
|
|
|
Usage:
|
|
python scripts/generate_config_docs.py # Write to docs/CONFIGURATION.md
|
|
python scripts/generate_config_docs.py --output /tmp/out # Write to custom location
|
|
python scripts/generate_config_docs.py --check # Exit 1 if docs are stale
|
|
"""
|
|
|
|
import argparse
|
|
import ast
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
|
|
def get_project_root() -> Path:
    """Locate the project root: the directory one level above this script's folder."""
    script_path = Path(__file__).resolve()
    # This file sits in a sub-directory of the project, so step up twice:
    # first to the containing folder, then to the project root.
    script_dir = script_path.parent
    return script_dir.parent
|
|
|
|
|
|
def get_env_var_name(key: str) -> str:
    """Map a dotted setting key to its ``LDR_``-prefixed environment variable name.

    Dots become underscores and the result is upper-cased, e.g.
    ``app.debug`` -> ``LDR_APP_DEBUG``.
    """
    flattened = key.replace(".", "_")
    return "LDR_" + flattened.upper()
|
|
|
|
|
|
def format_value(value: Any) -> str:
    """Render a setting's default value as text for the Markdown table.

    ``None`` becomes ``null``, booleans become lowercase ``true``/``false``,
    dicts and lists are JSON-encoded and wrapped in backticks, and anything
    else is converted with ``str``.
    """
    if value is None:
        rendered = "null"
    elif isinstance(value, bool):
        # Checked before the generic fallback so True/False are not shown as "True"/"False".
        rendered = "true" if value else "false"
    elif isinstance(value, (dict, list)):
        rendered = "`" + json.dumps(value) + "`"
    else:
        rendered = str(value)
    return rendered
|
|
|
|
|
|
def _discover_env_definition_files(env_defs_dir: Path) -> List[Path]:
    """List the ``*.py`` modules directly inside *env_defs_dir*, sorted by path.

    ``__init__.py`` and ``env_settings.py`` are left out. A missing (or
    non-directory) *env_defs_dir* yields an empty list.
    """
    skipped_names = ("__init__.py", "env_settings.py")

    if not env_defs_dir.is_dir():
        return []

    modules = [
        candidate
        for candidate in env_defs_dir.glob("*.py")
        if candidate.name not in skipped_names
    ]
    modules.sort()
    return modules
|
|
|
|
|
|
def _category_from_filename(filename: str) -> str:
    """Turn a module filename into a title-cased category label.

    The ``.py`` suffix is dropped, underscores become spaces, and each word
    is capitalised, e.g. ``'db_config.py'`` -> ``'Db Config'``.
    """
    stem = filename.removesuffix(".py")
    words = stem.split("_")
    return " ".join(words).title()
|
|
|
|
|
|
def _constant_keyword(keywords: Dict[str, ast.expr], name: str) -> Any:
    """Return the literal value of keyword *name*, or None if absent or not a literal."""
    node = keywords.get(name)
    if isinstance(node, ast.Constant):
        return node.value
    return None


def _extract_setting_from_call(node: ast.Call) -> Optional[Dict[str, Any]]:
    """Extract a setting dict from a ``*Setting(...)`` AST call node.

    Args:
        node: A call expression found while walking a parsed module.

    Returns:
        ``None`` when the call is not a plain-name call whose name ends in
        ``Setting``, or when it has no string-literal ``key`` keyword.
        Otherwise a dict with the keys ``key``, ``env_var``, ``description``,
        ``default``, ``type``, ``required``, ``min_value``, ``max_value``,
        ``allowed_values`` and ``deprecated_env_var``.
    """
    func = node.func
    if not (isinstance(func, ast.Name) and func.id.endswith("Setting")):
        return None

    # Only named keywords are of interest; ``**kwargs`` entries have arg=None.
    keywords = {k.arg: k.value for k in node.keywords if k.arg}

    # The key must be a string literal: it is later turned into an env var
    # name with string methods, so any other constant would crash there.
    key = _constant_keyword(keywords, "key")  # gitleaks:allow
    if not isinstance(key, str):
        return None

    # Description — a literal (adjacent string literals are already folded
    # into one constant by the parser) or, failing that, the expression source.
    description = ""
    desc_node = keywords.get("description")
    if isinstance(desc_node, ast.Constant):
        # ``description=None`` must not leak a non-string to the renderer.
        if desc_node.value is not None:
            description = str(desc_node.value)
    elif desc_node is not None:
        description = ast.unparse(desc_node)

    # Default is kept as source text so any expression can be displayed.
    default_val = "None"
    if "default" in keywords:
        default_val = ast.unparse(keywords["default"])

    # Env var: auto-generated unless overridden with a non-empty string literal.
    env_var = _constant_keyword(keywords, "env_var")
    if not (isinstance(env_var, str) and env_var):
        env_var = get_env_var_name(key)

    # Type from the class name (e.g. BooleanSetting -> Boolean). Only the
    # trailing "Setting" is stripped, so "SettingsSetting" stays "Settings".
    setting_type = func.id.removesuffix("Setting")

    required = bool(_constant_keyword(keywords, "required"))
    min_value = _constant_keyword(keywords, "min_value")
    max_value = _constant_keyword(keywords, "max_value")

    # Allowed values: the literal members of a set display.
    allowed_values = None
    allowed_node = keywords.get("allowed_values")
    if isinstance(allowed_node, ast.Set):
        literals = [
            elt.value
            for elt in allowed_node.elts
            if isinstance(elt, ast.Constant)
        ]
        try:
            allowed_values = sorted(literals)
        except TypeError:
            # Mixed, mutually unorderable literals (e.g. ints and strings):
            # fall back to a deterministic order instead of crashing.
            allowed_values = sorted(literals, key=repr)

    deprecated_env_var = _constant_keyword(keywords, "deprecated_env_var")

    return {
        "key": key,
        "env_var": env_var,
        "description": description,
        "default": default_val,
        "type": setting_type,
        "required": required,
        "min_value": min_value,
        "max_value": max_value,
        "allowed_values": allowed_values,
        "deprecated_env_var": deprecated_env_var,
    }
|
|
|
|
|
|
def get_env_only_settings(
    root_dir: Optional[Path] = None,
) -> List[Dict[str, Any]]:
    """
    Extract env-only settings from env_definitions/ by auto-discovering modules.

    These are settings required before database initialization.

    Each discovered module is parsed (not imported) and every ``*Setting(...)``
    call in it is turned into a dict; the dict is tagged with a ``category``
    derived from the module's filename.

    Args:
        root_dir: Project root to search under. Defaults to the root derived
            from this script's location.

    Returns:
        One dict per extracted setting. Modules that cannot be read or parsed
        are reported with a warning on stdout and skipped.
    """
    root_dir = root_dir or get_project_root()
    env_defs_dir = (
        root_dir
        / "src"
        / "local_deep_research"
        / "settings"
        / "env_definitions"
    )

    env_only: List[Dict[str, Any]] = []

    for filepath in _discover_env_definition_files(env_defs_dir):
        category = _category_from_filename(filepath.name)

        try:
            # Decode explicitly as UTF-8 (Python's default source encoding)
            # rather than relying on the platform locale.
            content = filepath.read_text(encoding="utf-8")
            tree = ast.parse(content, filename=str(filepath))
        except (OSError, SyntaxError, ValueError, RecursionError) as e:
            # Best effort: one unreadable module must not abort doc generation.
            print(f"Warning: Could not parse {filepath}: {e}")
            continue

        for node in ast.walk(tree):
            if not isinstance(node, ast.Call):
                continue

            setting = _extract_setting_from_call(node)
            if setting is None:
                continue

            setting["category"] = category
            env_only.append(setting)

    return env_only
|
|
|
|
|
|
def _format_constraints(setting: Dict[str, Any]) -> str:
    """Build a human-readable constraints string.

    Args:
        setting: Setting dict; the optional keys ``min_value``, ``max_value``
            and ``allowed_values`` are read.

    Returns:
        The numeric range (``min..max``, ``>=min`` or ``<=max``) and the
        comma-separated allowed values, joined by ``" | "``. Empty string
        when the setting has no constraints.
    """
    low = setting.get("min_value")
    high = setting.get("max_value")

    parts: List[str] = []
    # Compare against None so that a bound of 0 still counts as a constraint.
    if low is not None and high is not None:
        parts.append(f"{low}..{high}")
    elif low is not None:
        parts.append(f">={low}")
    elif high is not None:
        parts.append(f"<={high}")

    allowed = setting.get("allowed_values")
    if allowed:
        # Allowed values may be non-string literals (e.g. ints), which
        # str.join would reject, so stringify each one first.
        parts.append(", ".join(str(value) for value in allowed))

    return " | ".join(parts)
|
|
|
|
|
|
def _escape_table_cell(text: Any) -> str:
    """Render *text* so it fits inside a single Markdown table cell.

    A newline would end the table row and an unescaped pipe would start a new
    column, so newlines become spaces and pipes are backslash-escaped.
    """
    return str(text).replace("\n", " ").replace("|", "\\|")


def generate_docs_content(root_dir: Optional[Path] = None) -> str:
    """Generate the full CONFIGURATION.md content as a string.

    Settings are merged from every ``*.json`` file found recursively under
    ``src/local_deep_research/defaults`` (files are visited in sorted order,
    so a key defined in a later file replaces the earlier one), and the
    env-only settings are added as a separate table when any are found.

    Args:
        root_dir: Project root to read from. Defaults to the root derived
            from this script's location.

    Returns:
        The Markdown document, terminated by a single trailing newline.
        JSON files that cannot be loaded are reported on stdout and skipped.
    """
    root_dir = root_dir or get_project_root()
    defaults_dir = root_dir / "src" / "local_deep_research" / "defaults"

    settings: Dict[str, Any] = {}

    # Recursively find all JSON files
    for json_file in sorted(defaults_dir.rglob("*.json")):
        try:
            # JSON text is UTF-8; do not depend on the platform locale.
            with open(json_file, "r", encoding="utf-8") as f:
                data = json.load(f)
            settings.update(data)
        except (OSError, ValueError, TypeError) as e:
            # Best effort: a broken defaults file must not abort generation.
            print(f"Warning: Could not load {json_file}: {e}")

    sorted_keys = sorted(settings.keys())

    # Get env-only settings
    env_only_settings = get_env_only_settings(root_dir)

    # Build markdown
    content = [
        "# Configuration Reference",
        "",
        "This document is automatically generated from the application's default settings.",
        "All settings can be configured via the Web UI (Settings page), or overridden via Environment Variables.",
        "",
        "## Environment Variables",
        "",
        "To override a setting using an environment variable, convert the key to uppercase, replace dots with underscores, and prefix with `LDR_`.",
        "For example, `app.debug` becomes `LDR_APP_DEBUG`.",
        "",
        "Configuration Priority: Web UI Config > Environment Variables > Default Values",
        "> Environmental Variables are used to override default values, easing installation, while allowing for adjustments to configuration via Web UI.",
        "",
        "### System Locking",
        "There is a special environment variable `LDR_LOCKED_SETTINGS` that allows administrators to strictly enforce specific settings.",
        "",
        "* **Variable**: `LDR_LOCKED_SETTINGS`",
        "* **Format**: Comma-separated list of setting keys (e.g., `llm.model,app.port`)",
        "* **Behavior**:",
        " 1. Any setting listed here **MUST** have a corresponding value defined in the environment variables (e.g., `LDR_LLM_MODEL`). If not, the application will fail to start.",
        " 2. The setting becomes **read-only** in the Web UI.",
        " 3. The **Environment Variable** value takes absolute precedence, ignoring any value in the database.",
        "",
        "**Priority for Locked Settings**: Environment Variable > Database (Ignored) > Default (Ignored)",
        "",
        "",
    ]

    # Env-only section with expanded columns
    if env_only_settings:
        content.extend(
            [
                "## Pre-Database (Env-Only) Settings",
                "",
                "These settings are **required before database initialization** and can only be set via environment variables.",
                "They are not available in the Web UI because they are needed to start the application.",
                "",
                "| Environment Variable | Type | Default | Required | Constraints | Description | Category | Deprecated Alias |",
                "|----------------------|------|---------|----------|-------------|-------------|----------|------------------|",
            ]
        )

        for setting in sorted(env_only_settings, key=lambda x: x["env_var"]):
            env_var = setting["env_var"]
            stype = setting["type"]
            # Defaults are arbitrary expression source and may contain pipes.
            default = _escape_table_cell(setting["default"])
            required = "Yes" if setting.get("required") else "No"
            constraints = _escape_table_cell(_format_constraints(setting))
            # ``or ""`` guards against a missing/None description.
            desc = _escape_table_cell(setting.get("description") or "")
            category = setting["category"]
            deprecated = setting.get("deprecated_env_var") or ""

            row = (
                f"| `{env_var}` | {stype} | `{default}` | {required} "
                f"| {constraints} | {desc} | {category} | {deprecated} |"
            )
            content.append(row)

        content.extend(["", ""])

    # Main settings list
    content.extend(
        [
            "## Settings List",
            "",
            "| Key | Environment Variable | Default Value | Description | Type |",
            "|-----|----------------------|---------------|-------------|------|",
        ]
    )

    for key in sorted_keys:
        setting = settings[key]
        env_var = setting.get("env_var") or get_env_var_name(key)
        # String defaults may span lines or contain pipes; keep the row intact.
        default_val = _escape_table_cell(format_value(setting.get("value")))
        # A JSON ``"description": null`` yields None, hence the ``or ""``.
        description = _escape_table_cell(setting.get("description") or "")
        setting_type = setting.get("type", "UNKNOWN")

        row = f"| `{key}` | `{env_var}` | `{default_val}` | {description} | {setting_type} |"
        content.append(row)

    content.append("")
    content.append("*Generated by scripts/generate_config_docs.py*")

    return "\n".join(content) + "\n"
|
|
|
|
|
|
def generate_docs(
    output_path: Optional[Path] = None,
    check: bool = False,
) -> int:
    """Generate (or check) CONFIGURATION.md.

    Args:
        output_path: File to write or compare against. Defaults to
            ``docs/CONFIGURATION.md`` under the project root.
        check: When true, nothing is written; the existing file is compared
            with freshly generated content instead.

    Returns:
        0 on success, 1 if check finds the docs missing or stale.
    """
    root_dir = get_project_root()
    output_file = output_path or (root_dir / "docs" / "CONFIGURATION.md")

    new_content = generate_docs_content(root_dir)

    if check:
        if not output_file.exists():
            print(
                f"FAIL: {output_file} does not exist. "
                "Run 'python scripts/generate_config_docs.py' to generate it."
            )
            return 1
        # Read with the same explicit encoding used for writing so the
        # comparison does not depend on the platform's locale.
        existing = output_file.read_text(encoding="utf-8")
        if existing == new_content:
            print("OK: Configuration docs are up to date.")
            return 0
        print(
            f"FAIL: {output_file} is out of date. "
            "Run 'python scripts/generate_config_docs.py' to regenerate it."
        )
        return 1

    output_file.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: descriptions may contain non-ASCII characters that a
    # locale-default codec could fail to encode.
    output_file.write_text(new_content, encoding="utf-8")
    print(f"Wrote {output_file}")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the options, run the generator (or the
    # staleness check) and exit with the status it returns.
    cli = argparse.ArgumentParser(
        description="Generate CONFIGURATION.md from defaults"
    )
    cli.add_argument(
        "--output",
        "-o",
        type=Path,
        help="Output file path (default: docs/CONFIGURATION.md)",
    )
    cli.add_argument(
        "--check",
        action="store_true",
        help="Check if docs are up to date (exit 1 if stale)",
    )
    options = cli.parse_args()

    exit_code = generate_docs(output_path=options.output, check=options.check)
    sys.exit(exit_code)
|