# LibreChat/.github/workflows/gitnexus-deploy-do.yml

# Deploys GitNexus indexes to a DigitalOcean droplet via SSH + rsync.
#
# Architecture:
#   GitHub Actions (deploy)
#     1. Resolves latest successful index runs for main, dev, and every
#        open PR that already has an index artifact (contributor-gated
#        upstream by the index workflow's author_association check)
#     2. Downloads each matching .gitnexus/ artifact
#     3. Rsyncs them into /opt/gitnexus/indexes/<name>/ on the droplet
#     4. Removes any stale folders on the droplet for PRs that closed
#        (even though gitnexus-cleanup-pr.yml also handles that path,
#        this is a safety net in case the close event was missed)
#     5. Pulls latest image, force-recreates gitnexus, reloads Caddy,
#        and polls docker health until the container reports healthy
#   Caddy is reloaded in place rather than recreated (it is only recreated
#   if the reload fails, and started if it is not running) — no TLS churn.
#
# First-time droplet bootstrap (run once, manually):
#   1. Create 2GB+ Ubuntu 24.04 droplet, add SSH key
#   2. Point DNS A record for your subdomain at the droplet IP
#   3. SSH in and run:
#        curl -fsSL https://get.docker.com | sh
#        systemctl enable --now docker
#        mkdir -p /opt/gitnexus/indexes
#        useradd -m -s /bin/bash deploy
#        usermod -aG docker deploy
#        mkdir -p /home/deploy/.ssh
#        # Add deploy pubkey to /home/deploy/.ssh/authorized_keys
#        chown -R deploy:deploy /home/deploy/.ssh /opt/gitnexus
#        chmod 700 /home/deploy/.ssh
#        ufw allow 22,80,443/tcp
#        ufw --force enable
#   4. Copy .do/gitnexus/docker-compose.yml and Caddyfile into /opt/gitnexus/
#   5. Create /opt/gitnexus/.env with: GITNEXUS_DOMAIN=... and API_TOKEN=...
#   6. cd /opt/gitnexus && docker compose up -d
#
#   Then capture the droplet's SSH host key from your workstation and
#   save it as the GITNEXUS_DO_KNOWN_HOST secret (below) so CI can pin it:
#     ssh-keyscan -H gitnexus.yourdomain.com
#
#   GHCR image: the workflow runs `docker login ghcr.io` on the droplet
#   on every deploy using GITHUB_TOKEN, so the package can stay private.
#   If you'd rather not have CI manage droplet auth, make the package
#   public under repo Settings -> Packages.
#
# Required GitHub secrets:
#   GITNEXUS_DO_HOST        — droplet IP or hostname
#   GITNEXUS_DO_USER        — SSH user (e.g. "deploy")
#   GITNEXUS_DO_SSH_KEY     — private key matching the authorized pubkey
#   GITNEXUS_DO_KNOWN_HOST  — output of `ssh-keyscan -H <host>` pinning the
#                             droplet's host keys (prevents MITM/TOFU risk)

name: GitNexus Deploy (DigitalOcean)

# Fires after every completed "GitNexus Index" run, or on manual dispatch.
on:
  workflow_dispatch:
  workflow_run:
    workflows:
      - GitNexus Index
    types:
      - completed

# Workflow-wide token defaults; both jobs declare their own permissions.
permissions:
  pull-requests: read
  contents: read
  actions: read

# Per-ref concurrency: a newer run for the same branch/PR head cancels the
# older in-flight one, while runs triggered by different refs may proceed
# in parallel. Every deploy run re-resolves and rsyncs the full set of
# indexes (main, dev, and all open PRs with an artifact) and prunes anything
# outside that set, so the droplet reflects whatever the last run to finish
# discovered at resolve time.
# NOTE(review): parallel runs for different refs therefore write to the same
# folders and recreate the same container — confirm this overlap is intended.
concurrency:
  cancel-in-progress: true
  group: gitnexus-deploy-do-${{ github.event.workflow_run.head_branch || github.ref }}

env:
  IMAGE_NAME: ghcr.io/${{ github.repository_owner }}/librechat-gitnexus
  GITNEXUS_VERSION: '1.5.3'

jobs:
  # Builds and pushes the long-lived GitNexus image to GHCR.
  #
  # The job itself runs on manual dispatch or after a successful index run.
  # The actual login/build steps are skipped unless the most recent commit
  # touched the Dockerfile, entrypoint, or extension installer (or the run
  # was dispatched manually), so index-only deploys stay fast.
  build-image:
    if: |
      github.event_name == 'workflow_dispatch' ||
      github.event.workflow_run.conclusion == 'success'
    runs-on: ubuntu-latest
    timeout-minutes: 20
    permissions:
      contents: read
      packages: write # push image to GHCR
    outputs:
      image_tag: ${{ steps.tag.outputs.value }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          # Two commits so HEAD~1 exists for the diff below.
          fetch-depth: 2

      - name: Detect image changes
        id: changes
        run: |
          # Default to rebuild when we can't cleanly diff (first commit,
          # workflow_run from a PR branch where HEAD isn't the trigger, etc).
          # Rebuild on miss > skip when we should have rebuilt.
          if git rev-parse --verify HEAD~1 >/dev/null 2>&1 && \
             git diff --quiet HEAD~1 HEAD -- .do/gitnexus/Dockerfile .do/gitnexus/entrypoint.sh .do/gitnexus/install-extensions.js; then
            echo "changed=false" >> "$GITHUB_OUTPUT"
          else
            echo "changed=true" >> "$GITHUB_OUTPUT"
          fi

      - name: Compute image tag
        id: tag
        # GITNEXUS_VERSION comes from the workflow-level env block and is
        # exported to the step's shell, so no inline expression is needed.
        run: echo "value=v${GITNEXUS_VERSION}" >> "$GITHUB_OUTPUT"

      # The gha cache backend used below needs a BuildKit builder that can
      # export cache. The runner's default `docker` driver cannot do that
      # unless the containerd image store is enabled, so provision a
      # docker-container builder explicitly.
      - name: Set up Docker Buildx
        if: steps.changes.outputs.changed == 'true' || github.event_name == 'workflow_dispatch'
        uses: docker/setup-buildx-action@v3

      - name: Log in to GHCR
        if: steps.changes.outputs.changed == 'true' || github.event_name == 'workflow_dispatch'
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build and push image
        if: steps.changes.outputs.changed == 'true' || github.event_name == 'workflow_dispatch'
        uses: docker/build-push-action@v5
        with:
          context: .do/gitnexus
          file: .do/gitnexus/Dockerfile
          push: true
          # Publish both the moving `latest` tag and the version-pinned tag.
          tags: |
            ${{ env.IMAGE_NAME }}:latest
            ${{ env.IMAGE_NAME }}:${{ steps.tag.outputs.value }}
          build-args: |
            GITNEXUS_VERSION=${{ env.GITNEXUS_VERSION }}
          cache-from: type=gha
          cache-to: type=gha,mode=max

  # Syncs index artifacts to the droplet and restarts the service. Depends on
  # build-image, so it does not run when that job is skipped or fails.
  deploy:
    needs: build-image
    runs-on: ubuntu-latest
    timeout-minutes: 20
    permissions:
      actions: read
      contents: read
      # Job-level permissions replace the defaults, and anything not listed
      # is "none". The GITHUB_TOKEN is handed to the droplet for
      # `docker login ghcr.io`, so it must be allowed to read packages or the
      # pull of a private image is denied.
      packages: read
      pull-requests: read
    steps:
      # Shallow, sparse checkout: only the .do/gitnexus config directory is
      # needed on the runner.
      - name: Checkout deploy config
        uses: actions/checkout@v4
        with:
          fetch-depth: 1
          sparse-checkout: .do/gitnexus

      # Resolve every index to serve. For main/dev this is simple: latest
      # successful run per branch. For PRs, we enumerate ALL open PRs
      # (paginated) and look up each PR's gitnexus-index-pr-<N> artifact,
      # keeping the newest non-expired one.
      #
      # Outputs:
      #   matrix       — JSON array of { name, artifactName, runId }
      #   active_names — comma-separated folder names; the prune step deletes
      #                  every droplet folder NOT in this list, so the list
      #                  must be complete.
      - name: Resolve indexes to serve
        id: resolve
        uses: actions/github-script@v7
        with:
          script: |
            const { owner, repo } = context.repo;
            const serve = []; // [{ name, artifactName, runId }]

            // --- main and dev branches ---
            // NOTE(review): `branch` filters on the run's head branch name;
            // confirm a fork PR whose head branch is also called main/dev
            // cannot be returned here (its run would not carry the
            // gitnexus-index-<branch> artifact).
            for (const branch of ['main', 'dev']) {
              const { data } = await github.rest.actions.listWorkflowRuns({
                owner,
                repo,
                workflow_id: 'gitnexus-index.yml',
                branch,
                status: 'success',
                per_page: 1,
              });
              if (data.workflow_runs.length) {
                const runId = data.workflow_runs[0].id;
                const name = branch === 'main' ? 'LibreChat' : `LibreChat-${branch}`;
                serve.push({ name, artifactName: `gitnexus-index-${branch}`, runId });
                core.info(`${branch}: run ${runId} -> ${name}`);
              } else {
                core.warning(`No successful index run found for ${branch}`);
              }
            }

            // --- open PRs with at least one index artifact ---
            // Paginate: a single page tops out at 100 PRs, and any open PR
            // missing from this list would have its droplet folder pruned
            // as "stale" later in the job.
            const openPrs = await github.paginate(github.rest.pulls.list, {
              owner,
              repo,
              state: 'open',
              per_page: 100,
            });
            core.info(`Found ${openPrs.length} open PRs`);

            for (const pr of openPrs) {
              // PR branches live on forks too. listWorkflowRuns for a fork
              // branch name doesn't return anything useful, so we instead
              // query artifacts directly filtered by name.
              const artifactName = `gitnexus-index-pr-${pr.number}`;
              const { data: arts } = await github.rest.actions.listArtifactsForRepo({
                owner,
                repo,
                name: artifactName,
                per_page: 5,
              });
              // Pick the most recent non-expired artifact that still has a
              // workflow run attached (the API allows workflow_run to be
              // null, and the download step needs the run id).
              const fresh = arts.artifacts
                .filter((a) => !a.expired && a.workflow_run?.id)
                .sort((a, b) => new Date(b.created_at) - new Date(a.created_at))[0];
              if (!fresh) continue;
              serve.push({
                name: `LibreChat-pr-${pr.number}`,
                artifactName,
                runId: fresh.workflow_run.id,
              });
              core.info(`PR #${pr.number}: run ${fresh.workflow_run.id} -> LibreChat-pr-${pr.number}`);
            }

            if (!serve.length) {
              core.setFailed('No indexes to serve');
              return;
            }

            core.setOutput('matrix', JSON.stringify(serve));
            core.setOutput('active_names', serve.map((s) => s.name).join(','));

      # Downloads every artifact listed in the resolve step's matrix into
      # staging/<name>/.gitnexus on the runner.
      - name: Download each index artifact
        env:
          MATRIX: ${{ steps.resolve.outputs.matrix }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO: ${{ github.repository }}
        run: |
          set -euo pipefail
          mkdir -p staging
          # Parse the matrix up front so malformed JSON aborts the step here
          # instead of silently producing an empty loop and an empty deploy.
          entries=$(jq -c '.[]' <<<"$MATRIX")
          # main/dev artifact download failures are fatal — a missing
          # main/dev index is a real deploy failure. PR artifact failures
          # are soft — a PR artifact deleted mid-deploy shouldn't abort
          # the whole deploy and take main/dev down with it.
          #
          # The loop reads from a here-string (not a pipe) so it runs in the
          # main shell and `exit 1` terminates the step directly.
          while read -r entry; do
            [ -n "$entry" ] || continue
            name=$(jq -r '.name' <<<"$entry")
            artifact=$(jq -r '.artifactName' <<<"$entry")
            runId=$(jq -r '.runId' <<<"$entry")
            target="staging/${name}/.gitnexus"
            echo "Downloading $artifact from run $runId -> $target"
            mkdir -p "$target"
            # stdin is detached so gh can never consume the loop's input.
            if ! gh run download "$runId" \
                --repo "$REPO" \
                --name "$artifact" \
                --dir "$target" </dev/null; then
              case "$name" in
                LibreChat|LibreChat-dev)
                  echo "::error::Failed to download critical artifact $artifact"
                  exit 1
                  ;;
                *)
                  # The name stays in active_names so the prune step
                  # won't remove the droplet's existing copy. The old
                  # index keeps being served instead of being wiped to
                  # nothing — stale beats empty — but observability
                  # requires an explicit notice since this path is
                  # invisible in the happy-path deploy log.
                  echo "::warning::Failed to download PR artifact $artifact — skipping fresh sync; previous index (if any) will continue being served from the droplet"
                  rm -rf "staging/${name}"
                  ;;
              esac
            fi
          done <<<"$entries"
          echo ""
          echo "Staged for rsync:"
          du -sh staging/*/.gitnexus/ 2>/dev/null || echo "(none)"

      # Writes the deploy private key and the pinned host key to ~/.ssh on
      # the runner for the ssh/rsync steps that follow.
      - name: Setup SSH
        env:
          KNOWN_HOST: ${{ secrets.GITNEXUS_DO_KNOWN_HOST }}
          SSH_KEY: ${{ secrets.GITNEXUS_DO_SSH_KEY }}
        run: |
          set -e
          ssh_dir="$HOME/.ssh"
          key_file="$ssh_dir/deploy_key"
          hosts_file="$ssh_dir/known_hosts"

          mkdir -p "$ssh_dir"
          chmod 700 "$ssh_dir"

          printf '%s\n' "$SSH_KEY" > "$key_file"
          chmod 600 "$key_file"

          # The host key is pinned from a repository secret (captured once
          # with `ssh-keyscan -H <host>` at bootstrap) rather than scanned at
          # deploy time, so a swapped host is rejected instead of trusted.
          [ -n "$KNOWN_HOST" ] || {
            echo "::error::GITNEXUS_DO_KNOWN_HOST secret is empty. Run ssh-keyscan -H <host> and paste the output as this secret."
            exit 1
          }

          printf '%s\n' "$KNOWN_HOST" > "$hosts_file"
          chmod 600 "$hosts_file"

      - name: Authenticate droplet with GHCR
        # Images pushed to GHCR with GITHUB_TOKEN start out private, and the
        # droplet pulls the image on every deploy, so it is logged in again
        # here with the same short-lived token. For a public package this
        # step is redundant but harmless.
        #
        # The token is piped through SSH stdin and consumed by
        # `--password-stdin`; it must never be passed as a command argument,
        # where it would show up in the droplet's process table
        # (/proc/<pid>/cmdline). `printf '%s'` is used instead of `echo` so
        # exactly the token bytes are sent, with no trailing newline.
        env:
          GH_ACTOR: ${{ github.actor }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          SSH_HOST: ${{ secrets.GITNEXUS_DO_HOST }}
          SSH_USER: ${{ secrets.GITNEXUS_DO_USER }}
        run: |
          remote="$SSH_USER@$SSH_HOST"
          login_cmd="docker login ghcr.io -u '$GH_ACTOR' --password-stdin"
          printf '%s' "$GH_TOKEN" | ssh -i ~/.ssh/deploy_key "$remote" "$login_cmd"

      # Copies the compose file and Caddyfile from the checkout into
      # /opt/gitnexus/ on the droplet.
      - name: Upload config files
        env:
          SSH_HOST: ${{ secrets.GITNEXUS_DO_HOST }}
          SSH_USER: ${{ secrets.GITNEXUS_DO_USER }}
        run: |
          dest="$SSH_USER@$SSH_HOST:/opt/gitnexus/"
          # --archive/--compress/--rsh are the long forms of -a/-z/-e.
          rsync --archive --compress --rsh="ssh -i ~/.ssh/deploy_key" \
            .do/gitnexus/docker-compose.yml \
            .do/gitnexus/Caddyfile \
            "$dest"

      # Uploads every staged index to /opt/gitnexus/indexes/<name>/ and then
      # deletes any folder on the droplet whose name is not in ACTIVE_NAMES.
      - name: Rsync indexes and prune stale ones
        env:
          SSH_USER: ${{ secrets.GITNEXUS_DO_USER }}
          SSH_HOST: ${{ secrets.GITNEXUS_DO_HOST }}
          ACTIVE_NAMES: ${{ steps.resolve.outputs.active_names }}
        run: |
          set -e
          # Safety guard: the prune below removes every folder that is not
          # listed in ACTIVE_NAMES, so an empty list would wipe all indexes.
          if [ -z "$ACTIVE_NAMES" ]; then
            echo "::error::ACTIVE_NAMES is empty — refusing to sync/prune because every index on the droplet would be deleted"
            exit 1
          fi

          # Push every staged index up. --delete makes the remote folder an
          # exact mirror of the staged one.
          for dir in staging/*/; do
            [ -d "$dir" ] || continue
            name=$(basename "$dir")
            echo "Syncing $name"
            ssh -i ~/.ssh/deploy_key "$SSH_USER@$SSH_HOST" \
              "mkdir -p /opt/gitnexus/indexes/$name"
            rsync -az --delete -e "ssh -i ~/.ssh/deploy_key" \
              "$dir" \
              "$SSH_USER@$SSH_HOST:/opt/gitnexus/indexes/$name/"
          done

          # Prune any folders on the droplet that aren't in the active set.
          # This cleans up closed PRs the cleanup workflow might have missed,
          # and is safe because main/dev/PR-<N> are always present if active.
          echo "Pruning stale indexes (keeping: $ACTIVE_NAMES)"
          ssh -i ~/.ssh/deploy_key "$SSH_USER@$SSH_HOST" \
            ACTIVE_NAMES="$ACTIVE_NAMES" bash <<'REMOTE'
            set -e
            cd /opt/gitnexus/indexes || exit 0
            # nullglob makes `for dir in */` expand to nothing when the
            # directory is empty (first deploy), instead of the literal
            # string "*/". Explicit no-op > relying on rm -f to silently
            # tolerate a nonexistent file named "*".
            shopt -s nullglob
            IFS=',' read -ra ACTIVE <<< "$ACTIVE_NAMES"
            # Second guard on the remote side: never prune against an empty
            # keep-list, whatever happened to the variable in transit.
            if [ "${#ACTIVE[@]}" -eq 0 ]; then
              echo "ERROR: empty active set on droplet — refusing to prune"
              exit 1
            fi
            for dir in */; do
              dir="${dir%/}"
              keep=false
              for a in "${ACTIVE[@]}"; do
                if [ "$dir" = "$a" ]; then keep=true; break; fi
              done
              if [ "$keep" = false ]; then
                echo "Removing stale index: $dir"
                # `--` so a folder name starting with "-" is never parsed
                # as an option.
                rm -rf -- "$dir"
              fi
            done
          REMOTE

      # Final rollout step. Everything inside the heredoc runs on the droplet
      # in a single SSH session: pull the gitnexus image, recreate the
      # gitnexus container, reload (or start) Caddy, then poll the
      # container's health status and fail the job if it never turns healthy.
      - name: Pull image, restart gitnexus, reload Caddy, wait for healthy
        env:
          SSH_USER: ${{ secrets.GITNEXUS_DO_USER }}
          SSH_HOST: ${{ secrets.GITNEXUS_DO_HOST }}
        run: |
          # The quoted 'REMOTE' delimiter stops the runner's shell from
          # expanding anything in the script; it is sent to the droplet as-is.
          ssh -i ~/.ssh/deploy_key "$SSH_USER@$SSH_HOST" bash <<'REMOTE'
            set -e
            cd /opt/gitnexus
            # Only the gitnexus service is pulled and recreated here.
            docker compose pull gitnexus
            docker compose up -d --force-recreate gitnexus

            # Reload Caddy in-place so a changed Caddyfile takes effect
            # without losing TLS certs or restarting connections. If caddy
            # isn't running yet (first-time bootstrap), bring it up.
            if docker compose ps --status running caddy 2>/dev/null | grep -q caddy; then
              echo "Reloading Caddy config"
              # A failed reload falls back to recreating the caddy container.
              docker compose exec -T caddy caddy reload --config /etc/caddy/Caddyfile || {
                echo "Caddy reload failed — forcing restart"
                docker compose up -d --force-recreate caddy
              }
            else
              echo "Caddy not running — starting"
              docker compose up -d caddy
            fi

            # Poll gitnexus health until ready or timeout. Docker's own
            # unhealthy detection takes up to 150s (start_period 60s +
            # retries 3 * interval 30s — values from the compose healthcheck,
            # which is not part of this file), so the poll ceiling must clear
            # that to avoid false negatives when gitnexus legitimately
            # takes ~2.5 min to warm up.
            # Max wait = 36 sleeps * 5s = 180s: there are 37 checks, and the
            # 37th exits on failure before reaching its sleep.
            echo "Waiting for gitnexus to report healthy..."
            for i in $(seq 1 37); do
              # Falls back to "unknown" when inspect fails (e.g. the container
              # does not exist yet or reports no health state).
              STATUS=$(docker inspect --format='{{.State.Health.Status}}' gitnexus 2>/dev/null || echo unknown)
              echo "[$i/37] gitnexus health: $STATUS"
              if [ "$STATUS" = "healthy" ]; then
                echo "gitnexus is healthy"
                break
              fi
              if [ "$i" -eq 37 ]; then
                # Out of attempts: dump state and recent logs, then fail.
                echo "ERROR: gitnexus failed to become healthy after 180s"
                docker compose ps
                docker compose logs --tail 80 gitnexus
                exit 1
              fi
              sleep 5
            done

            # Success path: print a short status/log summary. The log
            # commands are best-effort and cannot fail the step.
            docker compose ps
            echo "--- Caddy logs (last 20 lines) ---"
            docker compose logs --tail 20 caddy || true
            echo "--- GitNexus logs (last 30 lines) ---"
            docker compose logs --tail 30 gitnexus || true
          REMOTE

      # always() keeps this step running after earlier failures, so the
      # private key is removed from the runner regardless of outcome.
      - name: Cleanup SSH key
        if: ${{ always() }}
        run: rm -f "$HOME/.ssh/deploy_key"