#### # Used for building the LDR service dependencies. #### FROM python:3.14.5-slim@sha256:7a500125bc50693f2214e842a621440a1b1b9cbb2188f74ab045d29ed2ea5856 AS builder-base # Set shell to bash with pipefail for safer pipe handling SHELL ["/bin/bash", "-o", "pipefail", "-c"] ARG DEBIAN_FRONTEND=noninteractive # Install system dependencies for SQLCipher and Node.js for frontend build # Using Acquire::Retries to handle transient Debian mirror errors during CI RUN apt-get update -o Acquire::Retries=3 && apt-get upgrade -y -o Acquire::Retries=3 \ && apt-get install -y --no-install-recommends -o Acquire::Retries=3 \ libsqlcipher-dev \ sqlcipher \ libsqlcipher1 \ build-essential \ pkg-config \ curl \ ca-certificates \ gnupg \ # Add NodeSource GPG key and repository directly (pinned to Node.js 24.x LTS) # GPG key fingerprint verification for supply chain security # Key: NSolid (RSA 2048-bit, created 2016-05-23) # Fingerprint verified from: https://github.com/nodesource/distributions # If key rotates, update NODESOURCE_GPG_FINGERPRINT and verify new key at: # https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key && NODESOURCE_GPG_FINGERPRINT="6F71F525282841EEDAF851B42F59B5F99B1BE0B4" \ && curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key -o /tmp/nodesource.gpg.key \ && ACTUAL_FINGERPRINT=$(gpg --with-fingerprint --with-colons --show-keys /tmp/nodesource.gpg.key 2>/dev/null | grep "^fpr" | head -1 | cut -d: -f10) \ && if [ "$ACTUAL_FINGERPRINT" != "$NODESOURCE_GPG_FINGERPRINT" ]; then \ echo "ERROR: NodeSource GPG key fingerprint mismatch!" >&2; \ echo "Expected: $NODESOURCE_GPG_FINGERPRINT" >&2; \ echo "Actual: $ACTUAL_FINGERPRINT" >&2; \ echo "The NodeSource signing key may have been rotated or compromised." >&2; \ echo "Verify the new key and update NODESOURCE_GPG_FINGERPRINT if valid." >&2; \ exit 1; \ fi \ && gpg --batch --dearmor -o /usr/share/keyrings/nodesource.gpg /tmp/nodesource.gpg.key \ && rm /tmp/nodesource.gpg.key \ && echo "deb [signed-by=/usr/share/keyrings/nodesource.gpg] https://deb.nodesource.com/node_24.x nodistro main" > /etc/apt/sources.list.d/nodesource.list \ && apt-get update \ && apt-get install -y --no-install-recommends nodejs \ && rm -rf /var/lib/apt/lists/* # Install dependencies and tools (pinned versions for reproducibility) # Pin pip, pdm, and playwright to specific versions for OSSF Scorecard compliance # Note: hishel<1.0.0 is required due to https://github.com/pdm-project/pdm/issues/3657 # Note: wheel>=0.46.2 is required for CVE-2026-24049 fix (path traversal) # Note: Scorecard's Pinned-Dependencies rule additionally wants per-package # --hash= or --require-hashes here. We considered hash-pinning the bootstrap # layer but rejected it: hash-locking pip itself has repeatedly broken # rebuilds (mirror/wheel-tag drift across CI runs). The base image's pip is # already verified, and we re-pin to a CVE-fixed version immediately. The # three resulting Scorecard alerts (#7740, #7741, #7742) are dismissed as # won't-fix; revisit if a stable hash-locking workflow becomes available. RUN pip3 install --no-cache-dir pip==26.1 \ && pip install --no-cache-dir pdm==2.26.2 "hishel<1.0.0" playwright==1.58.0 "wheel>=0.46.2" # disable update check ENV PDM_CHECK_UPDATE=false # Increase PDM request timeout from default 15s to 120s for large packages (numpy, torch) # This helps prevent httpcore.ReadTimeout errors during CI network congestion ENV PDM_REQUEST_TIMEOUT=120 # Build argument to invalidate cache when dependencies change ARG DEPS_HASH WORKDIR /install # Copy dependency files first (changes rarely) COPY pyproject.toml pyproject.toml COPY pdm.lock pdm.lock COPY LICENSE LICENSE COPY README.md README.md # Copy frontend build files COPY package.json package.json COPY package-lock.json* package-lock.json COPY vite.config.js vite.config.js # Source files last (changes most frequently). Note: with the current layout, # caching benefit is limited because all RUN commands (npm ci, npm run build, # pdm install) live in the builder stage which rebuilds when builder-base changes. # This ordering is still good practice for Dockerfile maintainability. COPY src/ src #### # Builds the LDR service dependencies used in production. #### FROM builder-base AS builder # Install npm dependencies, build frontend, and install Python dependencies # PDM will automatically select the correct SQLCipher package based on platform # Using npm ci for reproducible builds with lockfile integrity verification # These RUNs are separate for caching RUN npm ci RUN npm run build RUN for i in 1 2 3; do \ if pdm install --prod --no-editable; then \ break; \ else \ echo "PDM install attempt $i failed, retrying in 15s..."; \ sleep 15; \ fi; \ done #### # Container for running tests. #### FROM builder-base AS ldr-test # Set shell to bash with pipefail for safer pipe handling # Note: Explicitly set even though inherited from builder-base for hadolint static analysis SHELL ["/bin/bash", "-o", "pipefail", "-c"] ARG DEBIAN_FRONTEND=noninteractive # Install additional runtime dependencies for testing tools # Note: Node.js is already installed from builder-base # Using Acquire::Retries to handle transient Debian mirror errors during CI RUN apt-get update -o Acquire::Retries=3 && apt-get upgrade -y -o Acquire::Retries=3 \ && apt-get install -y --no-install-recommends -o Acquire::Retries=3 \ xauth \ xvfb \ # Dependencies for Chromium fonts-liberation \ libasound2 \ libatk-bridge2.0-0 \ libatk1.0-0 \ libatspi2.0-0 \ libcups2 \ libdbus-1-3 \ libdrm2 \ libgbm1 \ libgtk-3-0 \ libnspr4 \ libnss3 \ libxcomposite1 \ libxdamage1 \ libxfixes3 \ libxkbcommon0 \ libxrandr2 \ xdg-utils \ && rm -rf /var/lib/apt/lists/* # Set up Puppeteer environment ENV PUPPETEER_CACHE_DIR=/app/puppeteer-cache ENV DOCKER_ENV=true # Don't skip Chrome download - let Puppeteer download its own Chrome as fallback # ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true # Create puppeteer cache directory with proper permissions RUN mkdir -p /app/puppeteer-cache && chmod -R 755 /app/puppeteer-cache # Install Playwright with Chromium first (before npm packages) RUN playwright install --with-deps chromium || echo "Playwright install failed, will use Puppeteer's Chrome" # Copy test package files and lockfiles for npm ci COPY tests/api_tests_with_login/package.json tests/api_tests_with_login/package-lock.json /install/tests/api_tests_with_login/ COPY tests/ui_tests/package.json tests/ui_tests/package-lock.json /install/tests/ui_tests/ COPY tests/accessibility_tests/package.json tests/accessibility_tests/package-lock.json /install/tests/accessibility_tests/ # Install npm packages - Skip Puppeteer Chrome download since we have Playwright's Chrome WORKDIR /install/tests/api_tests_with_login ENV PUPPETEER_SKIP_DOWNLOAD=true ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true RUN for i in 1 2 3; do if npm ci; then break; else echo "npm ci attempt $i failed, retrying..."; sleep 5; fi; done WORKDIR /install/tests/ui_tests RUN for i in 1 2 3; do if npm ci; then break; else echo "npm ci attempt $i failed, retrying..."; sleep 5; fi; done WORKDIR /install/tests/accessibility_tests RUN for i in 1 2 3; do if npm ci; then break; else echo "npm ci attempt $i failed, retrying..."; sleep 5; fi; done # Install Node.js Playwright browsers (version may differ from Python playwright) RUN npx playwright install chromium # Create a stable symlink to Chrome for Puppeteer/Lighthouse. # Use the Playwright JavaScript API (chromium.executablePath()) to resolve the # exact binary path from the installed Node.js Playwright version, avoiding # hard-coded revision directories that change across releases. RUN CHROME_PATH=$(node -e "console.log(require('playwright-core').chromium.executablePath())") && \ if [ -n "$CHROME_PATH" ] && [ -x "$CHROME_PATH" ]; then \ echo "Symlinking Chrome from: $CHROME_PATH"; \ ln -sf "$CHROME_PATH" /usr/local/bin/chrome; \ else \ echo "WARNING: No Chrome binary found at $CHROME_PATH"; \ fi # Set environment variables for Puppeteer to use Playwright's Chrome ENV PUPPETEER_SKIP_DOWNLOAD=true ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true ENV PUPPETEER_EXECUTABLE_PATH=/usr/local/bin/chrome # Copy test files to /app where they will be run from RUN mkdir -p /app && cp -r /install/tests /app/ # Ensure Chrome binaries have correct permissions RUN chmod -R 755 /app/puppeteer-cache WORKDIR /install # Copy Vite build artifacts from builder stage so bundled CSS/JS are available. # styles.css is only loaded via Vite (imported in app.js), so without the dist/ # directory the page renders without layout CSS, causing a11y test failures. COPY --from=builder /install/src/local_deep_research/web/static/dist/ /install/src/local_deep_research/web/static/dist/ # Install the package using PDM # PDM will automatically select the correct SQLCipher package based on platform RUN pdm install --no-editable # Mirror of the chmod in the `ldr` stage (see comment there). The ldr-test # stage does its own pdm install instead of COPYing the venv from `builder`, # so it doesn't inherit that fix and would otherwise trip # _validate_migrations_permissions on every login during UI tests. RUN find /install/.venv -type d -path '*/local_deep_research/database/migrations' \ -exec chmod -R go-w {} + # Configure path to default to the venv python. ENV PATH="/install/.venv/bin:$PATH" # Note: Test container runs as root because CI workflows mount source code # volumes that are owned by root. The production container (ldr) runs as # non-root user for security. #### # Runs the LDR service. ### FROM python:3.14.5-slim@sha256:7a500125bc50693f2214e842a621440a1b1b9cbb2188f74ab045d29ed2ea5856 AS ldr # Set shell to bash with pipefail for safer pipe handling SHELL ["/bin/bash", "-o", "pipefail", "-c"] ARG DEBIAN_FRONTEND=noninteractive # Upgrade pip to fix CVE-2026-1703 (malicious wheel extraction) + GHSA-jp4c-xjxw-mgf9 # See builder-stage rationale above for why this install is not hash-pinned # — Scorecard alert #7742 dismissed as won't-fix on the same basis. RUN pip3 install --no-cache-dir pip==26.1 # Install runtime dependencies for SQLCipher and WeasyPrint RUN apt-get update && apt-get upgrade -y \ && apt-get install -y --no-install-recommends \ sqlcipher \ libsqlcipher1 \ # setpriv (from util-linux, already in base image) handles user switching # in the entrypoint — no additional package needed # # WeasyPrint dependencies for PDF generation libcairo2 \ libpango-1.0-0 \ libpangocairo-1.0-0 \ libgdk-pixbuf-2.0-0 \ libffi-dev \ shared-mime-info \ # GLib and GObject dependencies (libgobject is included in libglib2.0-0) libglib2.0-0 \ # CJK fonts so WeasyPrint can render Chinese/Japanese/Korean glyphs # in exported PDFs — without these the slim base image has no CJK # coverage and CJK text vanishes from the output (issue #4055). fonts-noto-cjk \ && rm -rf /var/lib/apt/lists/* # Create non-root user for running service (security best practice) RUN groupadd -r ldruser && useradd -r -g ldruser -u 1000 -m -d /home/ldruser ldruser # Create directories with proper permissions for non-root user RUN mkdir -p /app/.config/local_deep_research /home/ldruser/.local/share && \ chown -R ldruser:ldruser /app /home/ldruser && \ chmod -R 755 /app /home/ldruser # retrieve packages from build stage COPY --chown=ldruser:ldruser --from=builder /install/.venv/ /install/.venv ENV PATH="/install/.venv/bin:$PATH" # Strip world-write bit from the migrations subtree. The runtime check in # alembic_runner._validate_migrations_permissions refuses to run migrations # if anything under migrations/versions/ is world-writable, and pip/pdm can # leave permissive modes on the dir entries depending on the build host's # umask (see pip#8164, conda#12829). Without this normalisation a per-user # DB silently stays at its previous Alembic revision on every login, which # manifests downstream as e.g. "no such table: papers" on academic-source # saves. Targeted at the migrations subtree only — we deliberately avoid # blanket-chmoding the venv. RUN find /install/.venv -type d -path '*/local_deep_research/database/migrations' \ -exec chmod -R go-w {} + # Verify SQLCipher as ldruser via setpriv. # Running as ldruser ensures Python __pycache__ files created during import # are owned by ldruser. Browser binaries are NOT installed in the production # image — Playwright is only used for testing (ldr-test stage). RUN HOME=/home/ldruser setpriv --reuid=ldruser --regid=ldruser --init-groups -- \ python -c "from local_deep_research.database.sqlcipher_compat import get_sqlcipher_module; \ sqlcipher = get_sqlcipher_module(); \ print(f'✓ SQLCipher module loaded successfully: {sqlcipher}')" # Create volume for persistent configuration # Use /app for configuration to support non-root user VOLUME /app/.config/local_deep_research # Create volume for Ollama start script VOLUME /scripts/ # Copy the Ollama entrypoint script COPY --chown=ldruser:ldruser scripts/ollama_entrypoint.sh /scripts/ollama_entrypoint.sh # Copy LDR entrypoint script to handle volume permissions COPY scripts/ldr_entrypoint.sh /usr/local/bin/ldr_entrypoint.sh # COPY --chown sets ownership on copied contents, but Docker auto-creates # parent dirs (/install, /scripts) as root. Fix with non-recursive chown # (fast — avoids walking 500MB+ of venv files that are already ldruser-owned). RUN chmod +x /scripts/ollama_entrypoint.sh \ && chmod +x /usr/local/bin/ldr_entrypoint.sh \ && chown ldruser:ldruser /install /scripts EXPOSE 5000 # Health check for container orchestration (Docker, Kubernetes, etc.) # The ``timeout=8`` on urlopen is load-bearing: without it, urllib hangs forever # on a slow/blocked server. Docker's --timeout=10s only SIGKILLs the ``sh -c`` # wrapper; the python child gets reparented to PID 1 and continues holding a # TCP socket open against the app (one pidfd per hung child on PID 1). With # ``timeout=8`` the child returns/raises before Docker's wall, exits cleanly, # and gets reaped. HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \ CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:5000/api/v1/health', timeout=8)" || exit 1 STOPSIGNAL SIGINT # Use entrypoint to fix volume permissions, then switch to ldruser # The entrypoint runs as root to fix /data permissions, then drops to ldruser ENTRYPOINT ["/usr/local/bin/ldr_entrypoint.sh"] # Use PDM to run the application (passed to entrypoint as $@) CMD [ "ldr-web" ]