local-deep-research/Dockerfile

####
# Used for building the LDR service dependencies.
####
FROM python:3.14.6-slim@sha256:c79315c9ba2403aecb221fb9090486be9af43cdc2372959ca7ccf6b17ebe9912 AS builder-base

# Set shell to bash with pipefail for safer pipe handling
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

ARG DEBIAN_FRONTEND=noninteractive

# `apt-get upgrade -y` is INTENTIONAL — we want every build to pull the
# latest patched Debian packages so security fixes flow into the image.
# This trades bit-for-bit reproducibility (two rebuilds of the same source
# can produce different layer digests across a Debian patch window) for
# always-fresh-on-CVE behavior. The build-once-promote pipeline mitigates
# the reproducibility loss: prerelease-docker.yml builds once per release
# and the resulting digest is what gets retagged to :1.6.9 / :1.6 / :latest,
# so the released image is bit-identical to the one tested.
# Install system dependencies for SQLCipher and Node.js for frontend build
# Using Acquire::Retries to handle transient Debian mirror errors during CI
RUN apt-get update -o Acquire::Retries=3 && apt-get upgrade -y -o Acquire::Retries=3 \
    && apt-get install -y --no-install-recommends -o Acquire::Retries=3 \
    libsqlcipher-dev \
    sqlcipher \
    libsqlcipher1 \
    build-essential \
    pkg-config \
    curl \
    ca-certificates \
    gnupg \
    # Add NodeSource GPG key and repository directly (pinned to Node.js 24.x LTS)
    # GPG key fingerprint verification for supply chain security
    # Key: NSolid <nsolid-gpg@nodesource.com> (RSA 2048-bit, created 2016-05-23)
    # Fingerprint verified from: https://github.com/nodesource/distributions
    # If key rotates, update NODESOURCE_GPG_FINGERPRINT and verify new key at:
    # https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key
    && NODESOURCE_GPG_FINGERPRINT="6F71F525282841EEDAF851B42F59B5F99B1BE0B4" \
    && curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key -o /tmp/nodesource.gpg.key \
    && ACTUAL_FINGERPRINT=$(gpg --with-fingerprint --with-colons --show-keys /tmp/nodesource.gpg.key 2>/dev/null | grep "^fpr" | head -1 | cut -d: -f10) \
    && if [ "$ACTUAL_FINGERPRINT" != "$NODESOURCE_GPG_FINGERPRINT" ]; then \
         echo "ERROR: NodeSource GPG key fingerprint mismatch!" >&2; \
         echo "Expected: $NODESOURCE_GPG_FINGERPRINT" >&2; \
         echo "Actual:   $ACTUAL_FINGERPRINT" >&2; \
         echo "The NodeSource signing key may have been rotated or compromised." >&2; \
         echo "Verify the new key and update NODESOURCE_GPG_FINGERPRINT if valid." >&2; \
         exit 1; \
       fi \
    && gpg --batch --dearmor -o /usr/share/keyrings/nodesource.gpg /tmp/nodesource.gpg.key \
    && rm /tmp/nodesource.gpg.key \
    && echo "deb [signed-by=/usr/share/keyrings/nodesource.gpg] https://deb.nodesource.com/node_24.x nodistro main" > /etc/apt/sources.list.d/nodesource.list \
    && apt-get update \
    && apt-get install -y --no-install-recommends nodejs \
    && rm -rf /var/lib/apt/lists/*

# Install dependencies and tools (pinned versions for reproducibility)
# Pin pip, pdm, and playwright to specific versions for OSSF Scorecard compliance
# Note: hishel<1.0.0 is required due to https://github.com/pdm-project/pdm/issues/3657
# Note: wheel>=0.46.2 is required for CVE-2026-24049 fix (path traversal)
# Note: Scorecard's Pinned-Dependencies rule additionally wants per-package
# --hash= or --require-hashes here. We considered hash-pinning the bootstrap
# layer but rejected it: hash-locking pip itself has repeatedly broken
# rebuilds (mirror/wheel-tag drift across CI runs). The base image's pip is
# already verified, and we re-pin to a CVE-fixed version immediately. The
# three resulting Scorecard alerts (#7740, #7741, #7742) are dismissed as
# won't-fix; revisit if a stable hash-locking workflow becomes available.
RUN pip3 install --no-cache-dir pip==26.1 \
    && pip install --no-cache-dir pdm==2.26.2 "hishel<1.0.0" playwright==1.58.0 "wheel>=0.46.2"
# disable update check
ENV PDM_CHECK_UPDATE=false
# Increase PDM request timeout from default 15s to 120s for large packages (numpy, torch)
# This helps prevent httpcore.ReadTimeout errors during CI network congestion
ENV PDM_REQUEST_TIMEOUT=120

# NOTE: `DEPS_HASH` was previously declared as a cache-invalidation arg but
# never referenced in a RUN/COPY, so it had no effect — Docker only honors
# ARG values for cache when they're actually used downstream. Cache
# invalidation on dependency changes happens naturally via `COPY pdm.lock`
# below, since the file's content hash changes when deps change.
WORKDIR /install

# Copy dependency files first (changes rarely)
COPY pyproject.toml pyproject.toml
COPY pdm.lock pdm.lock
COPY LICENSE LICENSE
COPY README.md README.md

# Copy frontend build files
COPY package.json package.json
COPY package-lock.json* package-lock.json
COPY vite.config.js vite.config.js

# Source files last (changes most frequently). Note: with the current layout,
# caching benefit is limited because all RUN commands (npm ci, npm run build,
# pdm install) live in the builder stage which rebuilds when builder-base changes.
# This ordering is still good practice for Dockerfile maintainability.
COPY src/ src

####
# Builds the LDR service dependencies used in production.
####
FROM builder-base AS builder

# Install npm dependencies, build frontend, and install Python dependencies
# PDM will automatically select the correct SQLCipher package based on platform
# Using npm ci for reproducible builds with lockfile integrity verification
# These RUNs are separate for caching
RUN npm ci
RUN npm run build
RUN for i in 1 2 3; do \
      if pdm install --prod --no-editable; then \
        break; \
      else \
        echo "PDM install attempt $i failed, retrying in 15s..."; \
        sleep 15; \
      fi; \
    done


####
# Container for running tests.
####
FROM builder-base AS ldr-test

# Set shell to bash with pipefail for safer pipe handling
# Note: Explicitly set even though inherited from builder-base for hadolint static analysis
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

ARG DEBIAN_FRONTEND=noninteractive

# Install additional runtime dependencies for testing tools
# Note: Node.js is already installed from builder-base
# Using Acquire::Retries to handle transient Debian mirror errors during CI
# `apt-get upgrade -y` is INTENTIONAL — see the rationale comment on the
# corresponding upgrade in the builder-base stage (top of file).
RUN apt-get update -o Acquire::Retries=3 && apt-get upgrade -y -o Acquire::Retries=3 \
    && apt-get install -y --no-install-recommends -o Acquire::Retries=3 \
    xauth \
    xvfb \
    # Dependencies for Chromium
    fonts-liberation \
    libasound2 \
    libatk-bridge2.0-0 \
    libatk1.0-0 \
    libatspi2.0-0 \
    libcups2 \
    libdbus-1-3 \
    libdrm2 \
    libgbm1 \
    libgtk-3-0 \
    libnspr4 \
    libnss3 \
    libxcomposite1 \
    libxdamage1 \
    libxfixes3 \
    libxkbcommon0 \
    libxrandr2 \
    xdg-utils \
    && rm -rf /var/lib/apt/lists/*

# Set up Puppeteer environment
ENV PUPPETEER_CACHE_DIR=/app/puppeteer-cache
ENV DOCKER_ENV=true
# Don't skip Chrome download - let Puppeteer download its own Chrome as fallback
# ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true

# Create puppeteer cache directory with proper permissions
RUN mkdir -p /app/puppeteer-cache && chmod -R 755 /app/puppeteer-cache

# Install Playwright with Chromium first (before npm packages)
RUN playwright install --with-deps chromium || echo "Playwright install failed, will use Puppeteer's Chrome"

# Copy test package files and lockfiles for npm ci
COPY tests/api_tests_with_login/package.json tests/api_tests_with_login/package-lock.json /install/tests/api_tests_with_login/
COPY tests/ui_tests/package.json tests/ui_tests/package-lock.json /install/tests/ui_tests/
COPY tests/accessibility_tests/package.json tests/accessibility_tests/package-lock.json /install/tests/accessibility_tests/

# Install npm packages - Skip Puppeteer Chrome download since we have Playwright's Chrome
WORKDIR /install/tests/api_tests_with_login
ENV PUPPETEER_SKIP_DOWNLOAD=true
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true
RUN for i in 1 2 3; do if npm ci; then break; else echo "npm ci attempt $i failed, retrying..."; sleep 5; fi; done
WORKDIR /install/tests/ui_tests
RUN for i in 1 2 3; do if npm ci; then break; else echo "npm ci attempt $i failed, retrying..."; sleep 5; fi; done
WORKDIR /install/tests/accessibility_tests
RUN for i in 1 2 3; do if npm ci; then break; else echo "npm ci attempt $i failed, retrying..."; sleep 5; fi; done

# Install Node.js Playwright browsers (version may differ from Python playwright)
RUN npx playwright install chromium

# Create a stable symlink to Chrome for Puppeteer/Lighthouse.
# Use the Playwright JavaScript API (chromium.executablePath()) to resolve the
# exact binary path from the installed Node.js Playwright version, avoiding
# hard-coded revision directories that change across releases.
RUN CHROME_PATH=$(node -e "console.log(require('playwright-core').chromium.executablePath())") && \
    if [ -n "$CHROME_PATH" ] && [ -x "$CHROME_PATH" ]; then \
        echo "Symlinking Chrome from: $CHROME_PATH"; \
        ln -sf "$CHROME_PATH" /usr/local/bin/chrome; \
    else \
        echo "WARNING: No Chrome binary found at $CHROME_PATH"; \
    fi

# Set environment variables for Puppeteer to use Playwright's Chrome
ENV PUPPETEER_SKIP_DOWNLOAD=true
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true
ENV PUPPETEER_EXECUTABLE_PATH=/usr/local/bin/chrome

# Copy test files to /app where they will be run from
RUN mkdir -p /app && cp -r /install/tests /app/

# Ensure Chrome binaries have correct permissions
RUN chmod -R 755 /app/puppeteer-cache

WORKDIR /install

# Copy Vite build artifacts from builder stage so bundled CSS/JS are available.
# styles.css is only loaded via Vite (imported in app.js), so without the dist/
# directory the page renders without layout CSS, causing a11y test failures.
COPY --from=builder /install/src/local_deep_research/web/static/dist/ /install/src/local_deep_research/web/static/dist/

# Install the package using PDM
# PDM will automatically select the correct SQLCipher package based on platform
RUN pdm install --no-editable

# Mirror of the chmod in the `ldr` stage (see comment there). The ldr-test
# stage does its own pdm install instead of COPYing the venv from `builder`,
# so it doesn't inherit that fix and would otherwise trip
# _validate_migrations_permissions on every login during UI tests.
RUN find /install/.venv -type d -path '*/local_deep_research/database/migrations' \
        -exec chmod -R go-w {} +

# Configure path to default to the venv python.
ENV PATH="/install/.venv/bin:$PATH"

# Note: Test container runs as root because CI workflows mount source code
# volumes that are owned by root. The production container (ldr) runs as
# non-root user for security.

####
# Runs the LDR service.
###
FROM python:3.14.6-slim@sha256:c79315c9ba2403aecb221fb9090486be9af43cdc2372959ca7ccf6b17ebe9912 AS ldr

# Set shell to bash with pipefail for safer pipe handling
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

ARG DEBIAN_FRONTEND=noninteractive

# Upgrade pip to fix CVE-2026-1703 (malicious wheel extraction) + GHSA-jp4c-xjxw-mgf9
# See builder-stage rationale above for why this install is not hash-pinned
# — Scorecard alert #7742 dismissed as won't-fix on the same basis.
RUN pip3 install --no-cache-dir pip==26.1

# Install runtime dependencies for SQLCipher and WeasyPrint.
# `apt-get upgrade -y` is INTENTIONAL — see rationale on the builder-base
# upgrade (top of file). Trade reproducibility for always-fresh CVE patches.
RUN apt-get update && apt-get upgrade -y \
    && apt-get install -y --no-install-recommends \
    sqlcipher \
    libsqlcipher1 \
    # setpriv (from util-linux, already in base image) handles user switching
    # in the entrypoint — no additional package needed
    #
    # WeasyPrint dependencies for PDF generation
    libcairo2 \
    libpango-1.0-0 \
    libpangocairo-1.0-0 \
    libgdk-pixbuf-2.0-0 \
    libffi-dev \
    shared-mime-info \
    # GLib and GObject dependencies (libgobject is included in libglib2.0-0)
    libglib2.0-0 \
    # CJK fonts so WeasyPrint can render Chinese/Japanese/Korean glyphs
    # in exported PDFs — without these the slim base image has no CJK
    # coverage and CJK text vanishes from the output (issue #4055).
    fonts-noto-cjk \
    && rm -rf /var/lib/apt/lists/*

# Create non-root user for running service (security best practice)
RUN groupadd -r ldruser && useradd -r -g ldruser -u 1000 -m -d /home/ldruser ldruser

# Create directories with proper permissions for non-root user
RUN mkdir -p /app/.config/local_deep_research /home/ldruser/.local/share && \
    chown -R ldruser:ldruser /app /home/ldruser && \
    chmod -R 755 /app /home/ldruser

# retrieve packages from build stage
COPY --chown=ldruser:ldruser --from=builder /install/.venv/ /install/.venv
ENV PATH="/install/.venv/bin:$PATH"

# Strip world-write bit from the migrations subtree. The runtime check in
# alembic_runner._validate_migrations_permissions refuses to run migrations
# if anything under migrations/versions/ is world-writable, and pip/pdm can
# leave permissive modes on the dir entries depending on the build host's
# umask (see pip#8164, conda#12829). Without this normalisation a per-user
# DB silently stays at its previous Alembic revision on every login, which
# manifests downstream as e.g. "no such table: papers" on academic-source
# saves. Targeted at the migrations subtree only — we deliberately avoid
# blanket-chmoding the venv.
RUN find /install/.venv -type d -path '*/local_deep_research/database/migrations' \
        -exec chmod -R go-w {} +

# Verify SQLCipher as ldruser via setpriv.
# Running as ldruser ensures Python __pycache__ files created during import
# are owned by ldruser. Browser binaries are NOT installed in the production
# image — Playwright is only used for testing (ldr-test stage).
RUN HOME=/home/ldruser setpriv --reuid=ldruser --regid=ldruser --init-groups -- \
    python -c "from local_deep_research.database.sqlcipher_compat import get_sqlcipher_module; \
    sqlcipher = get_sqlcipher_module(); \
    print(f'✓ SQLCipher module loaded successfully: {sqlcipher}')"

# Persistent state. Without VOLUME directives the user loses all research
# data + DBs on `docker rm`. Recommend bind-mounting these in production.
# - /app/.config/local_deep_research: legacy config path (kept for backcompat)
# - /data: where the entrypoint creates logs/, cache/, encrypted_databases/ —
#   the actual user state, see scripts/ldr_entrypoint.sh.
#
# LDR_DATA_DIR pins the application to /data. Without this, the Python
# code falls back to platformdirs.user_data_dir() which resolves to
# /home/ldruser/.local/share/local-deep-research — NOT under any
# declared VOLUME, so a `docker run -v vol:/data ...` user (without
# also setting -e LDR_DATA_DIR=/data) would silently lose all data on
# `docker rm`. Documented run paths (docker-compose.yml, README docker
# run examples) already pass this env var explicitly; setting it here
# makes the VOLUME actually load-bearing for bare `docker run -v ...`
# invocations too.
ENV LDR_DATA_DIR=/data
VOLUME /app/.config/local_deep_research
VOLUME /data

# NOTE: /scripts/ is image content (ollama entrypoint baked in below), NOT
# user state. Previously declared as VOLUME, but a VOLUME on a directory
# that the image populates causes anonymous-volume creation on every
# `docker run` and silently shadows the script if a user bind-mounts it.
# Removed for correctness.
COPY --chown=ldruser:ldruser scripts/ollama_entrypoint.sh /scripts/ollama_entrypoint.sh

# Copy LDR entrypoint script to handle volume permissions
COPY scripts/ldr_entrypoint.sh /usr/local/bin/ldr_entrypoint.sh

# COPY --chown sets ownership on copied contents, but Docker auto-creates
# parent dirs (/install, /scripts) as root. Fix with non-recursive chown
# (fast — avoids walking 500MB+ of venv files that are already ldruser-owned).
RUN chmod +x /scripts/ollama_entrypoint.sh \
    && chmod +x /usr/local/bin/ldr_entrypoint.sh \
    && chown ldruser:ldruser /install /scripts

EXPOSE 5000

# Health check for container orchestration (Docker, Kubernetes, etc.)
# The ``timeout=8`` on urlopen is load-bearing: without it, urllib hangs forever
# on a slow/blocked server. Docker's --timeout=10s only SIGKILLs the ``sh -c``
# wrapper; the python child gets reparented to PID 1 and continues holding a
# TCP socket open against the app (one pidfd per hung child on PID 1). With
# ``timeout=8`` the child returns/raises before Docker's wall, exits cleanly,
# and gets reaped.
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:5000/api/v1/health', timeout=8)" || exit 1

STOPSIGNAL SIGINT

# Use entrypoint to fix volume permissions, then switch to ldruser
# The entrypoint runs as root to fix /data permissions, then drops to ldruser
ENTRYPOINT ["/usr/local/bin/ldr_entrypoint.sh"]

# Use PDM to run the application (passed to entrypoint as $@)
CMD [ "ldr-web" ]