local-deep-research/examples/api_usage/programmatic/advanced_features_example.py

#!/usr/bin/env python3
"""
Advanced Features Example for Local Deep Research

This example demonstrates advanced programmatic features including:
1. generate_report() - Create comprehensive markdown reports
2. Export formats - Save reports in different formats
3. Result analysis - Extract and analyze research findings
4. Keyword extraction - Identify key topics and concepts
"""

import json
from typing import Dict, List, Any

from local_deep_research.api import (
    generate_report,
    detailed_research,
    quick_summary,
)
from local_deep_research.api.settings_utils import create_settings_snapshot


def demonstrate_report_generation():
    """
    Generate a comprehensive research report using generate_report().

    This function creates a structured markdown report with:
    - Executive summary
    - Detailed findings organized by sections
    - Source citations
    - Conclusions and recommendations
    """
    print("=" * 70)
    print("GENERATE COMPREHENSIVE REPORT")
    print("=" * 70)
    print("""
This demonstrates the generate_report() function which:
- Creates a structured markdown report
- Performs multiple searches per section
- Organizes findings into coherent sections
- Includes citations and references
    """)

    # Configure settings for programmatic mode
    settings = create_settings_snapshot(
        overrides={
            "programmatic_mode": True,
            "search.tool": "wikipedia",
            "llm.temperature": 0.5,  # Lower for more focused output
            "api.allow_file_output": True,  # Allow generate_report to save files
        }
    )

    # Generate a comprehensive report
    print(
        "Generating report on 'Applications of Machine Learning in Healthcare'..."
    )
    report = generate_report(
        query="Applications of Machine Learning in Healthcare",
        output_file="ml_healthcare_report.md",
        searches_per_section=2,  # Multiple searches per section for depth
        settings_snapshot=settings,
        iterations=2,
        questions_per_iteration=3,
    )

    print("\n✓ Report generated successfully!")
    print(f"  - Report length: {len(report['content'])} characters")
    print(
        f"  - File saved to: {report.get('file_path', 'ml_healthcare_report.md')}"
    )

    # Show first part of report
    print("\nReport preview (first 500 chars):")
    print("-" * 40)
    print(report["content"][:500] + "...")

    return report


def demonstrate_export_formats():
    """
    Show how to export research results in different formats.

    Demonstrates:
    - Markdown export (default)
    - JSON export for programmatic processing
    - Custom formatting with templates
    """
    print("\n" + "=" * 70)
    print("EXPORT FORMATS")
    print("=" * 70)
    print("""
Exporting research in different formats:
- Markdown: Human-readable reports
- JSON: Structured data for processing
- Custom: Template-based formatting
    """)

    settings = create_settings_snapshot(
        overrides={
            "programmatic_mode": True,
            "search.tool": "wikipedia",
        }
    )

    # Get research results
    result = detailed_research(
        query="Renewable energy technologies",
        settings_snapshot=settings,
        iterations=1,
        questions_per_iteration=2,
    )

    # Export as JSON
    json_file = "research_results.json"
    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, default=str)
    print(f"\n✓ JSON export saved to: {json_file}")
    print(f"  - Contains: {len(result.get('findings', []))} findings")
    print(f"  - Sources: {len(result.get('sources', []))} sources")

    # Export as Markdown
    md_content = format_as_markdown(result)
    md_file = "research_results.md"
    with open(md_file, "w", encoding="utf-8") as f:
        f.write(md_content)
    print(f"\n✓ Markdown export saved to: {md_file}")
    print(f"  - Length: {len(md_content)} characters")

    # Export as custom format (e.g., BibTeX-like citations)
    citations = extract_citations(result)
    cite_file = "research_citations.txt"
    with open(cite_file, "w", encoding="utf-8") as f:
        for i, citation in enumerate(citations, 1):
            f.write(f"[{i}] {citation}\n")
    print(f"\n✓ Citations export saved to: {cite_file}")
    print(f"  - Total citations: {len(citations)}")

    return result


def demonstrate_result_analysis():
    """
    Analyze research results to extract insights and patterns.

    Shows how to:
    - Extract key findings
    - Identify recurring themes
    - Analyze source reliability
    - Generate statistics
    """
    print("\n" + "=" * 70)
    print("RESULT ANALYSIS")
    print("=" * 70)
    print("""
Analyzing research results to extract:
- Key findings and insights
- Common themes and patterns
- Source statistics
- Quality metrics
    """)

    settings = create_settings_snapshot(
        overrides={
            "programmatic_mode": True,
            "search.tool": "wikipedia",
        }
    )

    # Perform research
    result = detailed_research(
        query="Impact of artificial intelligence on employment",
        settings_snapshot=settings,
        search_strategy="source-based",
        iterations=2,
        questions_per_iteration=3,
    )

    # Analyze findings
    analysis = analyze_findings(result)

    print("\n📊 Research Analysis:")
    print(f"  - Total findings: {analysis['total_findings']}")
    print(f"  - Unique sources: {analysis['unique_sources']}")
    print(f"  - Questions explored: {analysis['total_questions']}")
    print(f"  - Iterations completed: {analysis['iterations']}")

    print("\n🔍 Finding Categories:")
    for category, count in analysis["categories"].items():
        print(f"  - {category}: {count} findings")

    print("\n📈 Source Distribution:")
    for source_type, count in analysis["source_types"].items():
        print(f"  - {source_type}: {count} sources")

    # Extract themes
    themes = extract_themes(result)
    print("\n🎯 Key Themes Identified:")
    for i, theme in enumerate(themes[:5], 1):
        print(f"  {i}. {theme}")

    return analysis


def demonstrate_keyword_extraction():
    """
    Extract keywords and key concepts from research results.

    Demonstrates:
    - Keyword extraction from findings
    - Concept identification
    - Topic clustering
    - Trend analysis
    """
    print("\n" + "=" * 70)
    print("KEYWORD & CONCEPT EXTRACTION")
    print("=" * 70)
    print("""
Extracting keywords and concepts:
- Important terms and phrases
- Technical concepts
- Named entities
- Trend indicators
    """)

    settings = create_settings_snapshot(
        overrides={
            "programmatic_mode": True,
            "search.tool": "wikipedia",
        }
    )

    # Quick research for keyword extraction
    result = quick_summary(
        query="Quantum computing breakthroughs 2024",
        settings_snapshot=settings,
        iterations=1,
        questions_per_iteration=3,
    )

    # Extract keywords
    keywords = extract_keywords(result)

    print("\n🔑 Top Keywords:")
    for keyword, frequency in keywords[:10]:
        print(f"  - {keyword}: {frequency} occurrences")

    # Extract concepts
    concepts = extract_concepts(result)

    print("\n💡 Key Concepts:")
    for i, concept in enumerate(concepts[:5], 1):
        print(f"  {i}. {concept}")

    # Identify technical terms
    technical_terms = extract_technical_terms(result)

    print("\n🔬 Technical Terms:")
    for term in technical_terms[:8]:
        print(f"  - {term}")

    return keywords, concepts


def format_as_markdown(result: Dict[str, Any]) -> str:
    """Convert research results to markdown format."""
    md = f"# Research Report: {result['query']}\n\n"
    md += f"**Research ID:** {result.get('research_id', 'N/A')}\n\n"

    # Summary
    md += "## Summary\n\n"
    md += result.get("summary", "No summary available") + "\n\n"

    # Findings
    md += "## Key Findings\n\n"
    findings = result.get("findings", [])
    for i, finding in enumerate(findings, 1):
        finding_text = finding if isinstance(finding, str) else str(finding)
        md += f"{i}. {finding_text}\n\n"

    # Sources
    md += "## Sources\n\n"
    sources = result.get("sources", [])
    for i, source in enumerate(sources, 1):
        source_text = source if isinstance(source, str) else str(source)
        md += f"- [{i}] {source_text}\n"

    # Metadata
    md += "\n## Metadata\n\n"
    metadata = result.get("metadata", {})
    for key, value in metadata.items():
        md += f"- **{key}:** {value}\n"

    return md


def extract_citations(result: Dict[str, Any]) -> List[str]:
    """Extract citations from research results."""
    citations = []
    sources = result.get("sources", [])

    for source in sources:
        if isinstance(source, dict):
            # Extract URL or title
            citation = source.get("url", source.get("title", str(source)))
        else:
            citation = str(source)
        citations.append(citation)

    return citations


def analyze_findings(result: Dict[str, Any]) -> Dict[str, Any]:
    """Analyze research findings for patterns and statistics."""
    findings = result.get("findings", [])
    sources = result.get("sources", [])
    questions = result.get("questions", {})

    # Categorize findings (simplified)
    categories = {
        "positive": 0,
        "negative": 0,
        "neutral": 0,
        "technical": 0,
    }

    for finding in findings:
        finding_text = str(finding).lower()
        if any(
            word in finding_text
            for word in ["benefit", "improve", "enhance", "positive"]
        ):
            categories["positive"] += 1
        elif any(
            word in finding_text
            for word in ["risk", "challenge", "negative", "concern"]
        ):
            categories["negative"] += 1
        elif any(
            word in finding_text
            for word in ["algorithm", "system", "technology", "method"]
        ):
            categories["technical"] += 1
        else:
            categories["neutral"] += 1

    # Analyze sources
    source_types = {}
    for source in sources:
        source_text = str(source).lower()
        if "wikipedia" in source_text:
            source_type = "Wikipedia"
        elif "arxiv" in source_text:
            source_type = "ArXiv"
        elif "github" in source_text:
            source_type = "GitHub"
        else:
            source_type = "Other"
        source_types[source_type] = source_types.get(source_type, 0) + 1

    return {
        "total_findings": len(findings),
        "unique_sources": len(sources),
        "total_questions": sum(len(qs) for qs in questions.values()),
        "iterations": result.get("iterations", 0),
        "categories": categories,
        "source_types": source_types,
    }


def extract_themes(result: Dict[str, Any]) -> List[str]:
    """Extract main themes from research results."""
    # Simplified theme extraction based on common patterns
    themes = []
    summary = result.get("summary", "")
    findings = result.get("findings", [])

    # Combine text for analysis
    full_text = summary + " ".join(str(f) for f in findings)

    # Simple theme patterns (in production, use NLP libraries)
    theme_patterns = {
        "automation": ["automation", "automated", "automatic"],
        "job displacement": ["job loss", "unemployment", "displacement"],
        "skill requirements": ["skills", "training", "education"],
        "economic impact": ["economy", "economic", "gdp", "growth"],
        "innovation": ["innovation", "innovative", "breakthrough"],
    }

    for theme, keywords in theme_patterns.items():
        if any(keyword in full_text.lower() for keyword in keywords):
            themes.append(theme.title())

    return themes


def extract_keywords(result: Dict[str, Any]) -> List[tuple]:
    """Extract keywords with frequency from research results."""
    from collections import Counter
    import re

    # Combine all text
    summary = result.get("summary", "")
    findings = " ".join(str(f) for f in result.get("findings", []))
    full_text = f"{summary} {findings}".lower()

    # Simple word extraction (in production, use NLP libraries)
    words = re.findall(r"\b[a-z]{4,}\b", full_text)

    # Filter common words
    stopwords = {
        "that",
        "this",
        "with",
        "from",
        "have",
        "been",
        "were",
        "which",
        "their",
        "about",
    }
    words = [w for w in words if w not in stopwords]

    # Count frequencies
    word_freq = Counter(words)

    return word_freq.most_common(20)


def extract_concepts(result: Dict[str, Any]) -> List[str]:
    """Extract key concepts from research results."""
    concepts = []
    summary = result.get("summary", "")

    # Simple concept patterns (in production, use NLP for entity extraction)
    concept_patterns = [
        r"quantum \w+",
        r"\w+ computing",
        r"\w+ algorithm",
        r"machine learning",
        r"artificial intelligence",
        r"\w+ technology",
    ]

    import re

    for pattern in concept_patterns:
        matches = re.findall(pattern, summary.lower())
        concepts.extend(matches)

    # Deduplicate and clean
    concepts = list(set(concepts))

    return concepts[:10]


def extract_technical_terms(result: Dict[str, Any]) -> List[str]:
    """Extract technical terms from research results."""
    technical_terms = []

    # Common technical term patterns
    tech_indicators = [
        "algorithm",
        "system",
        "protocol",
        "framework",
        "architecture",
        "quantum",
        "neural",
        "network",
        "model",
        "optimization",
    ]

    summary = result.get("summary", "").lower()
    import re

    for indicator in tech_indicators:
        # Find words containing or adjacent to technical indicators
        pattern = rf"\b\w*{indicator}\w*\b"
        matches = re.findall(pattern, summary)
        technical_terms.extend(matches)

    # Deduplicate
    technical_terms = list(set(technical_terms))

    return technical_terms


def demonstrate_batch_research():
    """
    Show how to perform batch research on multiple topics.

    Useful for:
    - Comparative analysis
    - Trend monitoring
    - Systematic reviews
    """
    print("\n" + "=" * 70)
    print("BATCH RESEARCH PROCESSING")
    print("=" * 70)
    print("""
Processing multiple research queries:
- Efficient batch processing
- Comparative analysis
- Result aggregation
    """)

    settings = create_settings_snapshot(
        overrides={
            "programmatic_mode": True,
            "search.tool": "wikipedia",
        }
    )

    # Topics for batch research
    topics = [
        "Solar energy innovations",
        "Wind power technology",
        "Hydrogen fuel cells",
    ]

    batch_results = {}

    print("\n📚 Batch Research:")
    for topic in topics:
        print(f"\n  Researching: {topic}")
        result = quick_summary(
            query=topic,
            settings_snapshot=settings,
            iterations=1,
            questions_per_iteration=2,
        )
        batch_results[topic] = result
        print(f"    ✓ Found {len(result.get('findings', []))} findings")

    # Aggregate results
    print("\n📊 Aggregate Analysis:")
    total_findings = sum(
        len(r.get("findings", [])) for r in batch_results.values()
    )
    total_sources = sum(
        len(r.get("sources", [])) for r in batch_results.values()
    )

    print(f"  - Total topics researched: {len(topics)}")
    print(f"  - Total findings: {total_findings}")
    print(f"  - Total sources: {total_sources}")
    print(f"  - Average findings per topic: {total_findings / len(topics):.1f}")

    # Save batch results
    batch_file = "batch_research_results.json"
    with open(batch_file, "w", encoding="utf-8") as f:
        json.dump(batch_results, f, indent=2, default=str)
    print(f"\n✓ Batch results saved to: {batch_file}")

    return batch_results


def main():
    """Run all advanced feature demonstrations."""
    print("=" * 70)
    print("LOCAL DEEP RESEARCH - ADVANCED FEATURES DEMONSTRATION")
    print("=" * 70)
    print("""
This example demonstrates advanced programmatic features:
1. Report generation with generate_report()
2. Multiple export formats
3. Result analysis and insights
4. Keyword and concept extraction
5. Batch research processing
    """)

    # Run demonstrations
    demonstrate_report_generation()
    demonstrate_export_formats()
    demonstrate_result_analysis()
    demonstrate_keyword_extraction()
    demonstrate_batch_research()

    print("\n" + "=" * 70)
    print("DEMONSTRATION COMPLETE")
    print("=" * 70)
    print("""
✓ All advanced features demonstrated successfully!

Key Takeaways:
1. generate_report() creates comprehensive markdown reports
2. Results can be exported in multiple formats (JSON, MD, custom)
3. Analysis tools extract insights, themes, and patterns
4. Keyword extraction identifies important terms and concepts
5. Batch processing enables systematic research

Files created:
- ml_healthcare_report.md - Full research report
- research_results.json - Structured research data
- research_results.md - Markdown formatted results
- research_citations.txt - Extracted citations
- batch_research_results.json - Batch research results

Next Steps:
- Customize report templates for your domain
- Integrate with data visualization tools
- Build automated research pipelines
- Create domain-specific analysis functions
    """)


if __name__ == "__main__":
    main()