local-deep-research/scripts/ci/dump_url_map.py

#!/usr/bin/env python3
"""
Dump the Flask url_map to a file as one absolute URL per line.

Used by the Nuclei DAST workflow to seed a URL list so the scanner
probes the actual application surface (authenticated routes, API
endpoints, blueprints) instead of just the index page.

Parameterized routes (`/research/<string:research_id>/status`) are
emitted with a converter-appropriate placeholder so Nuclei still
exercises the path. The substituted URL will usually 404 (the resource
doesn't exist for the test user), but that is fine — Nuclei probes path
traversal, parameter injection, and SQLi templates against the URL
pattern, not against a specific resource.

Skips:
  - The Flask `static` endpoint (asset serving, no app logic).
  - Routes that don't accept GET — Nuclei templates almost exclusively
    issue GET probes, so POST-only endpoints just generate 405s.

Usage:
    python scripts/ci/dump_url_map.py http://127.0.0.1:5000 > urls.txt
"""

import re
import sys


# Map Flask URL converters to a placeholder that satisfies the converter
# so Flask routes the request to the handler instead of 404-ing at the
# converter stage. Anything not listed falls back to a plain string.
_PLACEHOLDERS = {
    "int": "1",
    "float": "1",
    "uuid": "00000000-0000-0000-0000-000000000000",
}
_DEFAULT_PLACEHOLDER = "nuclei"

_PARAM_RE = re.compile(
    r"<(?:(?P<conv>[a-zA-Z_][a-zA-Z0-9_]*)(?:\([^)]*\))?:)?(?P<name>[a-zA-Z_][a-zA-Z0-9_]*)>"
)


def _substitute(match: "re.Match[str]") -> str:
    return _PLACEHOLDERS.get(match.group("conv") or "", _DEFAULT_PLACEHOLDER)


def main() -> int:
    if len(sys.argv) != 2:
        print(f"Usage: {sys.argv[0]} BASE_URL", file=sys.stderr)
        return 2

    base_url = sys.argv[1].rstrip("/")

    from local_deep_research.web.app_factory import create_app

    app, _ = create_app()

    seen: set[str] = set()
    for rule in app.url_map.iter_rules():
        if rule.endpoint == "static":
            continue
        if "GET" not in (rule.methods or set()):
            continue
        path = _PARAM_RE.sub(_substitute, rule.rule)
        url = f"{base_url}{path}"
        if url in seen:
            continue
        seen.add(url)
        print(url)

    return 0


if __name__ == "__main__":
    sys.exit(main())