mirror of
https://github.com/LearningCircuit/local-deep-research.git
synced 2026-06-15 19:46:56 +03:00
fix: remove naive secret detection from whitelist-check (gitleaks handles this)
The SECRET_VIOLATIONS check was causing false positives by flagging legitimate code that references keywords like 'api_key', 'password', 'token' (e.g., class attributes like `requires_api_key = False`). Gitleaks already runs as a separate workflow and handles secret detection with context-aware rules that don't produce these false positives.
This commit is contained in:
87
.github/scripts/file-whitelist-check.sh
vendored
87
.github/scripts/file-whitelist-check.sh
vendored
@@ -87,7 +87,6 @@ echo ""
|
||||
FILES_CHECKED=0
|
||||
WHITELIST_VIOLATIONS=()
|
||||
LARGE_FILES=()
|
||||
SECRET_VIOLATIONS=()
|
||||
BINARY_FILES=()
|
||||
SUSPICIOUS_FILES=()
|
||||
RESEARCH_DATA_VIOLATIONS=()
|
||||
@@ -138,61 +137,7 @@ if file "$file" | grep -q "binary"; then
|
||||
BINARY_FILES+=("$file")
|
||||
fi
|
||||
|
||||
# 4. Secret pattern check in file content - whitelist approach
|
||||
# Per-file secret keyword scan.
# Reads:   $file (path currently being examined by the enclosing code)
# Writes:  SAFE_FILE_PATTERNS, FILE_WHITELISTED, and appends to SECRET_VIOLATIONS
# A file is recorded as a violation when it (a) is a readable regular file,
# (b) matches none of the whitelist path regexes, (c) contains a secret-looking
# keyword or key-shaped string, and (d) contains none of the placeholder words.
if [[ -f "$file" && -r "$file" ]]; then
  # Extended-regex path patterns for files that are allowed to contain
  # sensitive-looking keywords. They are not anchored at the start, so each
  # one matches anywhere inside the path.
  SAFE_FILE_PATTERNS=(
    "src/local_deep_research/metrics/.*\.py$"
    "src/local_deep_research/web_search_engines/.*\.py$"
    "src/local_deep_research/web/services/.*\.py$"
    "src/local_deep_research/utilities/.*\.py$"
    "src/local_deep_research/config/.*\.py$"
    "src/local_deep_research/.*migrate.*\.py$"
    "src/local_deep_research/web/database/.*migration.*\.py$"
    "src/local_deep_research/advanced_search_system/.*\.py$"
    "src/local_deep_research/benchmarks/.*\.py$"
    "src/local_deep_research/web/static/js/components/.*\.js$"
    "src/local_deep_research/database/.*\.py$"
    "src/local_deep_research/web/queue/.*\.py$"
    "src/local_deep_research/api/.*\.py$"
    "src/local_deep_research/settings/.*\.py$"
    "src/local_deep_research/web/auth/.*\.py$"
    "src/local_deep_research/news/.*\.py$"
    "src/local_deep_research/defaults/settings/.*\.json$"
    "src/local_deep_research/defaults/settings_.*\.json$"
    "src/local_deep_research/defaults/llm_providers/.*\.json$"
    "src/local_deep_research/defaults/research_library/.*\.json$"
    "docs/.*\.md$"
    "tests/.*\.py$"
    ".*test.*\.py$"
    ".*mock.*\.py$"
    ".*example.*\.py$"
    "scripts/audit_.*\.py$"
    "\.github/CODEOWNERS$"
    "\.github/workflows/.*\.yml$"
    "github/scripts/.*\.sh$"
  )

  # Decide whether the path is whitelisted. The builtin regex match replaces
  # an `echo | grep -qE` pipeline: no processes are spawned per pattern, and
  # a path such as "-n" cannot be swallowed by echo as an option.
  FILE_WHITELISTED=false
  for pattern in "${SAFE_FILE_PATTERNS[@]}"; do
    # $pattern is deliberately unquoted so it is treated as a regex.
    if [[ "$file" =~ $pattern ]]; then
      FILE_WHITELISTED=true
      break
    fi
  done

  # Only scan the content of files that are not whitelisted.
  if [[ "$FILE_WHITELISTED" == "false" ]]; then
    # Keyword and key-shape detection (case-insensitive). `--` keeps a path
    # that starts with '-' from being parsed as a grep option; -q stops at
    # the first match. Errors stay suppressed, as in the original check.
    if grep -qiE "(api[_-]?key|secret|password|token|private[_-]?key|sk-[a-zA-Z0-9]{20,}|claude-[a-zA-Z0-9]{20,}|AIzaSy[a-zA-Z0-9_-]{33})" -- "$file" 2>/dev/null; then
      # False-positive filter. NOTE(review): this looks at the whole file,
      # not the matching line, so a single occurrence of e.g. "test"
      # anywhere in the file exempts it entirely.
      if ! grep -qiE "(example|sample|test|mock|placeholder|<.*>|\{\{.*\}\})" -- "$file" 2>/dev/null; then
        SECRET_VIOLATIONS+=("$file")
      fi
    fi
  fi
fi
|
||||
# 4. Secret pattern check - REMOVED: gitleaks workflow handles this more accurately
|
||||
|
||||
# 5. Suspicious filename patterns - whitelist approach
|
||||
SAFE_FILENAME_PATTERNS=(
|
||||
@@ -327,7 +272,6 @@ echo "📋 Summary of findings:"
|
||||
echo " - File type violations: ${#WHITELIST_VIOLATIONS[@]}"
|
||||
echo " - Large files: ${#LARGE_FILES[@]}"
|
||||
echo " - Binary files: ${#BINARY_FILES[@]}"
|
||||
echo " - Potential secrets: ${#SECRET_VIOLATIONS[@]}"
|
||||
echo " - Suspicious filenames: ${#SUSPICIOUS_FILES[@]}"
|
||||
echo " - Research data leaks: ${#RESEARCH_DATA_VIOLATIONS[@]}"
|
||||
echo " - Hardcoded Flask secrets: ${#FLASK_SECRET_VIOLATIONS[@]}"
|
||||
@@ -394,35 +338,6 @@ echo ""
|
||||
done
|
||||
fi
|
||||
|
||||
# Report section for files recorded in SECRET_VIOLATIONS.
# Reads:   SECRET_VIOLATIONS (array of file paths), TOTAL_VIOLATIONS
# Writes:  FILE_TYPE, FILE_SIZE, and adds the number of entries to TOTAL_VIOLATIONS
# Output:  for each file, up to five matching lines plus file type and size
if [[ ${#SECRET_VIOLATIONS[@]} -gt 0 ]]; then
  echo ""
  echo "❌ POTENTIAL SECRETS IN FILE CONTENT - Suspicious patterns found:"
  echo " These files contain keywords like 'api_key', 'secret', 'password', 'token' but"
  echo " are not in the whitelisted safe directories. Review them carefully!"
  echo ""
  for violation in "${SECRET_VIOLATIONS[@]}"; do
    echo " 🔍 EXAMINING: $violation"

    # Show the lines that triggered detection. The regex is the same one the
    # detection step uses, including the key-shaped alternatives (sk-…,
    # claude-…, AIzaSy…); with the shorter keyword-only regex a file flagged
    # purely by a key-shaped string printed this header with nothing below it.
    echo " → Suspicious content found:"
    grep -n -iE "(api[_-]?key|secret|password|token|private[_-]?key|sk-[a-zA-Z0-9]{20,}|claude-[a-zA-Z0-9]{20,}|AIzaSy[a-zA-Z0-9_-]{33})" -- "$violation" 2>/dev/null \
      | head -5 \
      | while read -r line; do
          echo " $line"
        done

    # File type and size for context; each falls back to "unknown" on error.
    if [[ -f "$violation" ]]; then
      FILE_TYPE=$(file -b -- "$violation" 2>/dev/null || echo "unknown")
      # NOTE(review): `stat -c%s` is the GNU coreutils form; on systems where
      # it is not supported the fallback prints "unknown".
      FILE_SIZE=$(stat -c%s -- "$violation" 2>/dev/null || echo "unknown")
      echo " → File info: $FILE_TYPE (${FILE_SIZE} bytes)"
    fi

    echo " → Issue: Contains sensitive-looking keywords outside whitelisted areas"
    echo " → Fix: Either add to SAFE_FILE_PATTERNS whitelist or remove secrets"
    echo ""
  done
  TOTAL_VIOLATIONS=$((TOTAL_VIOLATIONS + ${#SECRET_VIOLATIONS[@]}))
fi
|
||||
|
||||
if [ ${#SUSPICIOUS_FILES[@]} -gt 0 ]; then
|
||||
echo ""
|
||||
echo "❌ SUSPICIOUS FILENAMES - Files with security-sensitive names:"
|
||||
|
||||
Reference in New Issue
Block a user