fix: remove naive secret detection from whitelist-check (gitleaks handles this)

The SECRET_VIOLATIONS check was causing false positives by flagging
legitimate code that references keywords like 'api_key', 'password',
'token' (e.g., class attributes like `requires_api_key = False`).

Gitleaks already runs as a separate workflow and handles secret detection
with context-aware rules that don't produce these false positives.
This commit is contained in:
LearningCircuit
2025-12-05 00:08:31 +01:00
parent 484f32741d
commit 730a17446b

View File

@@ -87,7 +87,6 @@ echo ""
# Accumulators for the findings this script reports. Each array starts empty
# and its length is printed in the "Summary of findings" section.
# NOTE(review): FILES_CHECKED is only initialised in the visible code;
# presumably it is incremented once per scanned file elsewhere - confirm.
FILES_CHECKED=0
# Reported in the summary as "File type violations".
WHITELIST_VIOLATIONS=()
# Reported in the summary as "Large files".
LARGE_FILES=()
# Files flagged by the content-based secret pattern check; reported in the
# summary as "Potential secrets".
SECRET_VIOLATIONS=()
# Files whose `file` output contains "binary".
BINARY_FILES=()
# Reported in the summary as "Suspicious filenames".
SUSPICIOUS_FILES=()
# Reported in the summary as "Research data leaks".
RESEARCH_DATA_VIOLATIONS=()
@@ -138,61 +137,7 @@ if file "$file" | grep -q "binary"; then
BINARY_FILES+=("$file")
fi
# 4. Secret pattern check in file content - whitelist approach
# Content scan for the current "$file": a readable regular file is recorded in
# SECRET_VIOLATIONS when its path is not allow-listed, its content mentions a
# secret-like keyword or provider key shape, and nothing in the file looks like
# an example/placeholder.
if [[ -f "$file" && -r "$file" ]]; then
  # Path regexes (grep -E, unanchored at the start) for files that are allowed
  # to contain sensitive-looking keywords.
  SAFE_FILE_PATTERNS=(
    "src/local_deep_research/metrics/.*\.py$"
    "src/local_deep_research/web_search_engines/.*\.py$"
    "src/local_deep_research/web/services/.*\.py$"
    "src/local_deep_research/utilities/.*\.py$"
    "src/local_deep_research/config/.*\.py$"
    "src/local_deep_research/.*migrate.*\.py$"
    "src/local_deep_research/web/database/.*migration.*\.py$"
    "src/local_deep_research/advanced_search_system/.*\.py$"
    "src/local_deep_research/benchmarks/.*\.py$"
    "src/local_deep_research/web/static/js/components/.*\.js$"
    "src/local_deep_research/database/.*\.py$"
    "src/local_deep_research/web/queue/.*\.py$"
    "src/local_deep_research/api/.*\.py$"
    "src/local_deep_research/settings/.*\.py$"
    "src/local_deep_research/web/auth/.*\.py$"
    "src/local_deep_research/news/.*\.py$"
    "src/local_deep_research/defaults/settings/.*\.json$"
    "src/local_deep_research/defaults/settings_.*\.json$"
    "src/local_deep_research/defaults/llm_providers/.*\.json$"
    "src/local_deep_research/defaults/research_library/.*\.json$"
    "docs/.*\.md$"
    "tests/.*\.py$"
    ".*test.*\.py$"
    ".*mock.*\.py$"
    ".*example.*\.py$"
    "scripts/audit_.*\.py$"
    "\.github/CODEOWNERS$"
    "\.github/workflows/.*\.yml$"
    "github/scripts/.*\.sh$"
  )

  # One matching pattern is enough to exempt the file; stop at the first hit.
  FILE_WHITELISTED=false
  for pattern in "${SAFE_FILE_PATTERNS[@]}"; do
    echo "$file" | grep -qE "$pattern" || continue
    FILE_WHITELISTED=true
    break
  done

  # The greps only run for non-exempt files. The second grep is a file-wide
  # false-positive filter: any example/sample/test/mock/placeholder marker
  # anywhere in the file suppresses the finding.
  if [[ "$FILE_WHITELISTED" == "false" ]] \
    && grep -iE "(api[_-]?key|secret|password|token|private[_-]?key|sk-[a-zA-Z0-9]{20,}|claude-[a-zA-Z0-9]{20,}|AIzaSy[a-zA-Z0-9_-]{33})" "$file" >/dev/null 2>&1 \
    && ! grep -iE "(example|sample|test|mock|placeholder|<.*>|\{\{.*\}\})" "$file" >/dev/null 2>&1; then
    SECRET_VIOLATIONS+=("$file")
  fi
fi
# 4. Secret pattern check - REMOVED: gitleaks workflow handles this more accurately
# 5. Suspicious filename patterns - whitelist approach
SAFE_FILENAME_PATTERNS=(
@@ -327,7 +272,6 @@ echo "📋 Summary of findings:"
# One line per finding category, showing how many entries each result array holds.
printf ' - File type violations: %s\n' "${#WHITELIST_VIOLATIONS[@]}"
printf ' - Large files: %s\n' "${#LARGE_FILES[@]}"
printf ' - Binary files: %s\n' "${#BINARY_FILES[@]}"
printf ' - Potential secrets: %s\n' "${#SECRET_VIOLATIONS[@]}"
printf ' - Suspicious filenames: %s\n' "${#SUSPICIOUS_FILES[@]}"
printf ' - Research data leaks: %s\n' "${#RESEARCH_DATA_VIOLATIONS[@]}"
printf ' - Hardcoded Flask secrets: %s\n' "${#FLASK_SECRET_VIOLATIONS[@]}"
@@ -394,35 +338,6 @@ echo ""
done
fi
# Detailed report for content-based secret findings. Does nothing when
# SECRET_VIOLATIONS is empty; otherwise prints one section per file and adds
# the number of findings to TOTAL_VIOLATIONS.
if (( ${#SECRET_VIOLATIONS[@]} > 0 )); then
  printf '%s\n' \
    "" \
    "❌ POTENTIAL SECRETS IN FILE CONTENT - Suspicious patterns found:" \
    " These files contain keywords like 'api_key', 'secret', 'password', 'token' but" \
    " are not in the whitelisted safe directories. Review them carefully!" \
    ""

  for violation in "${SECRET_VIOLATIONS[@]}"; do
    echo " 🔍 EXAMINING: $violation"

    # At most five matching lines, each prefixed with its line number (grep -n).
    echo " → Suspicious content found:"
    grep -n -iE "(api[_-]?key|secret|password|token|private[_-]?key)" "$violation" 2>/dev/null \
      | head -5 \
      | while read -r line; do
          echo " $line"
        done

    # File type and byte size for context; each falls back to "unknown" when
    # the corresponding command fails.
    if [[ -f "$violation" ]]; then
      FILE_TYPE=$(file -b "$violation" 2>/dev/null || echo "unknown")
      FILE_SIZE=$(stat -c%s "$violation" 2>/dev/null || echo "unknown")
      echo " → File info: $FILE_TYPE (${FILE_SIZE} bytes)"
    fi

    printf '%s\n' \
      " → Issue: Contains sensitive-looking keywords outside whitelisted areas" \
      " → Fix: Either add to SAFE_FILE_PATTERNS whitelist or remove secrets" \
      ""
  done

  TOTAL_VIOLATIONS=$(( TOTAL_VIOLATIONS + ${#SECRET_VIOLATIONS[@]} ))
fi
if [ ${#SUSPICIOUS_FILES[@]} -gt 0 ]; then
echo ""
echo "❌ SUSPICIOUS FILENAMES - Files with security-sensitive names:"