fact-checker-backend/app/api/fact_check.py

from fastapi import APIRouter, HTTPException
import asyncio
import logging
import httpx
import json
import re
from typing import Union, Optional, Dict, Any
from datetime import datetime
from app.config import OPENAI_API_KEY,PERPLEXITY_API_KEY
from app.api.scrap_websites import search_websites, SearchRequest
from app.services.openai_client import OpenAIClient, AIFactChecker
from app.services.image_text_extractor import ImageTextExtractor
from app.models.ai_fact_check_models import AIFactCheckResponse
from app.models.fact_check_models import (
    FactCheckRequest,
    FactCheckResponse,
    UnverifiedFactCheckResponse,
    Source,
    VerdictEnum,
    ConfidenceEnum
)

# Setup logging
logger = logging.getLogger(__name__)

fact_check_router = APIRouter()
openai_client = OpenAIClient(OPENAI_API_KEY)
ai_fact_checker = AIFactChecker(openai_client)
image_text_extractor = ImageTextExtractor(OPENAI_API_KEY)


async def process_url_content(url: str) -> Optional[str]:
    """Extract text content from the provided URL."""
    try:
        # Add await here
        text = await image_text_extractor.extract_text(url, is_url=True)
        if text:
            logger.info(f"Successfully extracted text from URL: {text}")
        else:
            logger.warning(f"No text could be extracted from URL: {url}")
        return text
    except Exception as e:
        logger.error(f"Error extracting text from URL: {str(e)}")
        return None

# Assuming the enums and models like FactCheckResponse, VerdictEnum, etc., are already imported

async def process_fact_check(query: str) -> Union[FactCheckResponse, UnverifiedFactCheckResponse]:
    if not PERPLEXITY_API_KEY:
        logger.error("Perplexity API key not configured")
        return UnverifiedFactCheckResponse(
            claim=query,
            verdict=VerdictEnum.UNVERIFIED,
            confidence=ConfidenceEnum.LOW,
            sources=[],
            evidence="The fact-checking service is not properly configured.",
            explanation="The system is missing required API configuration for fact-checking services.",
            additional_context="This is a temporary system configuration issue."
        )

    url = "https://api.perplexity.ai/chat/completions"
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "Authorization": f"Bearer {PERPLEXITY_API_KEY}"
    }

    payload = {
        "model": "sonar",
        "messages": [
            {
                "role": "system",
                "content": (
                    "You are a precise fact checker. Analyze the following claim and determine if it's true, false, or partially true. "
                    "Provide a clear verdict, confidence level (HIGH, MEDIUM, LOW), and cite reliable sources. "
                    "Format your response as JSON with fields: verdict, confidence, sources (array of URLs), "
                    "evidence (key facts as a string), and explanation (detailed reasoning as a string)."
                )
            },
            {
                "role": "user",
                "content": f"Fact check this claim: {query}"
            }
        ]
    }

    try:
        async with httpx.AsyncClient(timeout=30) as client:
            response = await client.post(url, headers=headers, json=payload)
            response.raise_for_status()
            result = response.json()
            perplexity_response = result["choices"][0]["message"]["content"]

            # Attempt to extract JSON
            try:
                parsed_data = json.loads(perplexity_response)
            except json.JSONDecodeError:
                match = re.search(r'\{.*\}', perplexity_response, re.DOTALL)
                if match:
                    parsed_data = json.loads(match.group(0))
                else:
                    parsed_data = extract_fact_check_info(perplexity_response)

            verdict_mapping = {
                "true": VerdictEnum.TRUE,
                "false": VerdictEnum.FALSE,
                "partially true": VerdictEnum.PARTIALLY_TRUE,
                "partially false": VerdictEnum.PARTIALLY_TRUE,
                "unverified": VerdictEnum.UNVERIFIED
            }

            confidence_mapping = {
                "high": ConfidenceEnum.HIGH,
                "medium": ConfidenceEnum.MEDIUM,
                "low": ConfidenceEnum.LOW
            }

            raw_verdict = parsed_data.get("verdict", "").lower()
            verdict = verdict_mapping.get(raw_verdict, VerdictEnum.UNVERIFIED)

            raw_confidence = parsed_data.get("confidence", "").lower()
            confidence = confidence_mapping.get(raw_confidence, ConfidenceEnum.MEDIUM)

            sources = [
                Source(
                    url=url,
                    domain=extract_domain(url),
                    title=f"Source from {extract_domain(url)}",
                    publisher=extract_domain(url),
                    date_published=None,
                    snippet="Source cited by Perplexity AI"
                )
                for url in parsed_data.get("sources", [])
            ]

            # Convert evidence to string if it's not already
            evidence = parsed_data.get("evidence", "")
            if isinstance(evidence, dict):
                # Convert dictionary evidence to string format
                evidence_str = ""
                for key, value in evidence.items():
                    evidence_str += f"{key}: {value}\n"
                evidence = evidence_str.strip()

            # Convert explanation to string if it's not already
            explanation = parsed_data.get("explanation", "")
            if isinstance(explanation, dict):
                explanation_str = ""
                for key, value in explanation.items():
                    explanation_str += f"{key}: {value}\n"
                explanation = explanation_str.strip()

            return FactCheckResponse(
                claim=query,
                verdict=verdict,
                confidence=confidence,
                sources=sources,
                evidence=evidence,
                explanation=explanation,
                additional_context=f"Fact checked using PlanPost AI on {datetime.now().strftime('%Y-%m-%d')}"
            )

    except Exception as e:
        logger.error(f"Fact check error: {str(e)}")
        return UnverifiedFactCheckResponse(
            claim=query,
            verdict=VerdictEnum.UNVERIFIED,
            confidence=ConfidenceEnum.LOW,
            sources=[],
            evidence='No fact check results found.',
            explanation="Failed to contact Perplexity AI or parse its response.",
            additional_context="Possible API issue or malformed response."
        )


def extract_domain(url: str) -> str:
    """Extract domain from URL.

    Args:
        url: The URL to extract domain from

    Returns:
        The domain name or "unknown" if parsing fails
    """
    try:
        from urllib.parse import urlparse
        parsed_url = urlparse(url)
        domain = parsed_url.netloc
        return domain if domain else "unknown"
    except Exception as e:
        logger.warning(f"Failed to extract domain from URL {url}: {str(e)}")
        return "unknown"


def extract_fact_check_info(text_response: str) -> Dict[str, Any]:
    """Extract fact-checking information from a text response when JSON parsing fails.

    Args:
        text_response: The text response from Perplexity AI

    Returns:
        A dictionary with fact-checking information extracted from the text
    """
    import re

    result = {
        "verdict": "unverified",
        "confidence": "medium",
        "sources": [],
        "evidence": "",
        "explanation": ""
    }

    # Try to extract verdict with more comprehensive pattern matching
    verdict_patterns = [
        r'verdict[:\s]+(true|false|partially true|partially false|inconclusive|unverified)',
        r'(true|false|partially true|partially false|inconclusive|unverified)[:\s]+verdict',
        r'claim is (true|false|partially true|partially false|inconclusive|unverified)',
        r'statement is (true|false|partially true|partially false|inconclusive|unverified)'
    ]

    for pattern in verdict_patterns:
        verdict_match = re.search(pattern, text_response.lower(), re.IGNORECASE)
        if verdict_match:
            result["verdict"] = verdict_match.group(1)
            break

    # Try to extract confidence with multiple patterns
    confidence_patterns = [
        r'confidence[:\s]+(high|medium|low)',
        r'(high|medium|low)[:\s]+confidence',
        r'confidence level[:\s]+(high|medium|low)',
        r'(high|medium|low)[:\s]+confidence level'
    ]

    for pattern in confidence_patterns:
        confidence_match = re.search(pattern, text_response.lower(), re.IGNORECASE)
        if confidence_match:
            result["confidence"] = confidence_match.group(1)
            break

    # Try to extract URLs as sources - more robust pattern
    urls = re.findall(r'https?://[^\s"\'\]\)]+', text_response)
    # Filter out any malformed URLs
    valid_urls = []
    for url in urls:
        if '.' in url and len(url) > 10:  # Basic validation
            valid_urls.append(url)
    result["sources"] = valid_urls

    # Try to extract evidence and explanation with multiple patterns
    evidence_patterns = [
        r'evidence[:\s]+(.*?)(?=explanation|\Z)',
        r'key facts[:\s]+(.*?)(?=explanation|\Z)',
        r'facts[:\s]+(.*?)(?=explanation|\Z)'
    ]

    for pattern in evidence_patterns:
        evidence_match = re.search(pattern, text_response, re.IGNORECASE | re.DOTALL)
        if evidence_match:
            result["evidence"] = evidence_match.group(1).strip()
            break

    explanation_patterns = [
        r'explanation[:\s]+(.*?)(?=\Z)',
        r'reasoning[:\s]+(.*?)(?=\Z)',
        r'analysis[:\s]+(.*?)(?=\Z)'
    ]

    for pattern in explanation_patterns:
        explanation_match = re.search(pattern, text_response, re.IGNORECASE | re.DOTALL)
        if explanation_match:
            result["explanation"] = explanation_match.group(1).strip()
            break

    # If no structured information found, use the whole response as evidence
    if not result["evidence"] and not result["explanation"]:
        result["evidence"] = text_response
        # Generate a minimal explanation if none was found
        result["explanation"] = "The fact-checking service provided information about this claim but did not structure it in the expected format. The full response has been included as evidence for you to review."

    return result


async def generate_fact_report(query: str, fact_check_data: dict | AIFactCheckResponse) -> Union[FactCheckResponse, UnverifiedFactCheckResponse]:
    """Generate a fact check report using OpenAI based on the fact check results."""
    try:
        base_system_prompt = """You are a professional fact-checking reporter. Your task is to create a detailed fact check report based on the provided data. Focus on accuracy, clarity, and proper citation of sources.

Rules:
1. Include all source URLs and names in the sources list
2. Keep the explanation focused on verifiable facts
3. Include dates when available
4. Maintain objectivity in the report
5. If no reliable sources are found, provide a clear explanation why"""

        # Handle both dictionary and AIFactCheckResponse
        if hasattr(fact_check_data, 'verification_result'):
            # It's an AIFactCheckResponse
            has_sources = bool(fact_check_data.sources)
            verification_result = fact_check_data.verification_result
            fact_check_data_dict = fact_check_data.dict()
        else:
            # It's a dictionary
            has_sources = bool(fact_check_data.get("claims") or fact_check_data.get("urls_found"))
            verification_result = fact_check_data.get("verification_result", {})
            fact_check_data_dict = fact_check_data

        # If no sources were found, return an unverified response
        if not has_sources or (
            isinstance(fact_check_data, dict) and
            fact_check_data.get("status") == "no_results"
        ) or (verification_result and verification_result.get("no_sources_found")):
            return UnverifiedFactCheckResponse(
                claim=query,
                verdict=VerdictEnum.UNVERIFIED,
                confidence=ConfidenceEnum.LOW,
                sources=[],
                evidence="No fact-checking sources have verified this claim yet.",
                explanation="Our search across reputable fact-checking websites did not find any formal verification of this claim. This doesn't mean the claim is false - just that it hasn't been formally fact-checked yet.",
                additional_context="The claim may be too recent for fact-checkers to have investigated, or it may not have been widely circulated enough to warrant formal fact-checking."
            )

        base_user_prompt = """Generate a comprehensive fact check report in this exact JSON format:
{
    "claim": "Write the exact claim being verified",
    "verdict": "One of: True/False/Partially True/Unverified",
    "confidence": "One of: High/Medium/Low",
    "sources": [
        {
            "url": "Full URL of the source",
            "name": "Name of the source organization"
        }
    ],
    "evidence": "A concise summary of the key evidence (1-2 sentences)",
    "explanation": "A detailed explanation including who verified it, when it was verified, and the key findings (2-3 sentences)",
    "additional_context": "Important context about the verification process, limitations, or broader implications (1-2 sentences)"
}"""

        if isinstance(fact_check_data, dict) and "claims" in fact_check_data:
            system_prompt = base_system_prompt
            user_prompt = f"""Query: {query}
            Fact Check Results: {fact_check_data_dict}

            {base_user_prompt}

            The report should:
            1. Include ALL source URLs and organization names
            2. Specify verification dates when available
            3. Name the fact-checking organizations involved
            4. Describe the verification process"""
        else:
            system_prompt = base_system_prompt
            user_prompt = f"""Query: {query}
            Fact Check Results: {fact_check_data_dict}

            {base_user_prompt}

            The report should:
            1. Include ALL source URLs and names from both verification_result and sources fields
            2. Mention all fact-checking organizations involved
            3. Describe the verification process
            4. Note any conflicting information between sources"""

        response = await openai_client.generate_text_response(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            max_tokens=1000
        )

        try:
            response_data = response["response"]

            if isinstance(response_data.get("sources"), list):
                cleaned_sources = []
                for source in response_data["sources"]:
                    if isinstance(source, str):
                        url = source if source.startswith("http") else f"https://{source}"
                        cleaned_sources.append({"url": url, "name": source})
                    elif isinstance(source, dict):
                        url = source.get("url", "")
                        if url and not url.startswith("http"):
                            source["url"] = f"https://{url}"
                        cleaned_sources.append(source)
                response_data["sources"] = cleaned_sources

            if response_data["verdict"] == "Unverified" or not response_data.get("sources"):
                return UnverifiedFactCheckResponse(**response_data)
            return FactCheckResponse(**response_data)

        except Exception as validation_error:
            logger.error(f"Response validation error: {str(validation_error)}")
            return UnverifiedFactCheckResponse(
                claim=query,
                verdict=VerdictEnum.UNVERIFIED,
                confidence=ConfidenceEnum.LOW,
                sources=[],
                evidence="An error occurred while processing the fact check results.",
                explanation="The system encountered an error while validating the fact check results.",
                additional_context="This is a technical error and does not reflect on the truthfulness of the claim."
            )

    except Exception as e:
        logger.error(f"Error generating fact report: {str(e)}")
        return UnverifiedFactCheckResponse(
            claim=query,
            verdict=VerdictEnum.UNVERIFIED,
            confidence=ConfidenceEnum.LOW,
            sources=[],
            evidence="An error occurred while generating the fact check report.",
            explanation="The system encountered an unexpected error while processing the fact check request.",
            additional_context="This is a technical error and does not reflect on the truthfulness of the claim."
        )

async def combine_fact_reports(query: str, url_text: str, query_result: Dict[str, Any], url_result: Dict[str, Any]) -> Union[FactCheckResponse, UnverifiedFactCheckResponse]:
    """Combine fact check results from query and URL into a single comprehensive report."""
    try:
        system_prompt = """You are a professional fact-checking reporter. Your task is to create a comprehensive fact check report by combining and analyzing multiple fact-checking results. Focus on accuracy, clarity, and proper citation of all sources.

Rules:
1. Include all source URLs and names from both result sets
2. Compare and contrast findings from different sources
3. Include dates when available
4. Note any discrepancies between sources
5. Provide a balanced, objective analysis"""

        user_prompt = f"""Original Query: {query}
Extracted Text from URL: {url_text}

First Fact Check Result: {query_result}
Second Fact Check Result: {url_result}

Generate a comprehensive fact check report in this exact JSON format:
{{
    "claim": "Write the exact claim being verified",
    "verdict": "One of: True/False/Partially True/Unverified",
    "confidence": "One of: High/Medium/Low",
    "sources": [
        {{
            "url": "Full URL of the source",
            "name": "Name of the source organization"
        }}
    ],
    "evidence": "A concise summary of the key evidence from both sources (2-3 sentences)",
    "explanation": "A detailed explanation combining findings from both fact checks (3-4 sentences)",
    "additional_context": "Important context about differences or similarities in findings (1-2 sentences)"
}}

The report should:
1. Combine sources from both fact checks
2. Compare findings from both analyses
3. Note any differences in conclusions
4. Provide a unified verdict based on all available information"""

        response = await openai_client.generate_text_response(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            max_tokens=1000
        )

        response_data = response["response"]

        # Clean up sources from both results
        if isinstance(response_data.get("sources"), list):
            cleaned_sources = []
            for source in response_data["sources"]:
                if isinstance(source, str):
                    url = source if source.startswith("http") else f"https://{source}"
                    cleaned_sources.append({"url": url, "name": source})
                elif isinstance(source, dict):
                    url = source.get("url", "")
                    if url and not url.startswith("http"):
                        source["url"] = f"https://{url}"
                    cleaned_sources.append(source)
            response_data["sources"] = cleaned_sources

        if response_data["verdict"] == "Unverified" or not response_data.get("sources"):
            return UnverifiedFactCheckResponse(**response_data)
        return FactCheckResponse(**response_data)

    except Exception as e:
        logger.error(f"Error combining fact reports: {str(e)}")
        return UnverifiedFactCheckResponse(
            claim=query,
            verdict=VerdictEnum.UNVERIFIED,
            confidence=ConfidenceEnum.LOW,
            sources=[],
            evidence="An error occurred while combining fact check reports.",
            explanation="The system encountered an error while trying to combine results from multiple sources.",
            additional_context="This is a technical error and does not reflect on the truthfulness of the claim."
        )


@fact_check_router.post("/check-facts", response_model=Union[FactCheckResponse, UnverifiedFactCheckResponse])
async def check_facts(request: FactCheckRequest):
    """
    Fetch fact check results and generate a comprehensive report.
    Handles both query-based and URL-based fact checking.
    Always returns a 200 response with appropriate content, never an error.
    """
    try:
        url_text = None
        query_result = None
        url_result = None

        # If URL is provided, try to extract text
        if request.url:
            try:
                url_text = await process_url_content(request.url)
            except Exception as e:
                logger.error(f"Error extracting text from URL: {str(e)}")
                url_text = None

            if not url_text and not request.query:
                # Only return early if URL text extraction failed and no query provided
                return UnverifiedFactCheckResponse(
                    claim=f"URL check requested: {request.url}",
                    verdict=VerdictEnum.UNVERIFIED,
                    confidence=ConfidenceEnum.LOW,
                    sources=[],
                    evidence="No fact check results found",
                    explanation="The system encountered errors while processing the fact checks.",
                    additional_context="Please try again with different input or contact support if the issue persists."
                )

            # If URL text was successfully extracted, process it
            if url_text:
                logger.info(f"Processing fact check for extracted text: {url_text}")
                try:
                    url_result = await process_fact_check(url_text)
                except Exception as e:
                    logger.error(f"Error processing fact check for URL text: {str(e)}")
                    url_result = UnverifiedFactCheckResponse(
                        claim=f"URL: {request.url}",
                        verdict=VerdictEnum.UNVERIFIED,
                        confidence=ConfidenceEnum.LOW,
                        sources=[],
                        evidence="No fact check results found",
                        explanation="The system encountered errors while processing the fact checks.",
                        additional_context="Please try again with different input or contact support if the issue persists."
                    )

        # Process query if provided
        if request.query:
            try:
                query_result = await process_fact_check(request.query)
            except Exception as e:
                logger.error(f"Error processing fact check for query: {str(e)}")
                query_result = UnverifiedFactCheckResponse(
                    claim=request.query,
                    verdict=VerdictEnum.UNVERIFIED,
                    confidence=ConfidenceEnum.LOW,
                    sources=[],
                    evidence="No fact check results found",
                    explanation="The system encountered errors while processing the fact checks.",
                    additional_context="Please try again with different input or contact support if the issue persists."
                )

        # If both results are available, combine them
        if query_result and url_result and url_text:
            try:
                return await combine_fact_reports(request.query, url_text,
                                                query_result.dict(), url_result.dict())
            except Exception as e:
                logger.error(f"Error combining fact reports: {str(e)}")
                return UnverifiedFactCheckResponse(
                    claim=request.query or f"URL: {request.url}",
                    verdict=VerdictEnum.UNVERIFIED,
                    confidence=ConfidenceEnum.LOW,
                    sources=[],
                    evidence="No fact check results found",
                    explanation="The system encountered errors while processing the fact checks.",
                    additional_context="Please try again with different input or contact support if the issue persists."
                )

        # If only one result is available
        if query_result:
            return query_result
        if url_result:
            return url_result

        # If no valid results
        return UnverifiedFactCheckResponse(
            claim=request.query or f"URL: {request.url}",
            verdict=VerdictEnum.UNVERIFIED,
            confidence=ConfidenceEnum.LOW,
            sources=[],
            evidence="No fact check results found",
            explanation="The system encountered errors while processing the fact checks.",
            additional_context="Please try again with different input or contact support if the issue persists."
        )

    except Exception as e:
        # Catch-all exception handler to ensure we always return a 200 response
        logger.error(f"Unexpected error in check_facts: {str(e)}")
        return UnverifiedFactCheckResponse(
            claim=request.query or f"URL: {request.url}",
            verdict=VerdictEnum.UNVERIFIED,
            confidence=ConfidenceEnum.LOW,
            sources=[],
            evidence="No fact check results found",
            explanation="The system encountered errors while processing the fact checks.",
            additional_context="Please try again with different input or contact support if the issue persists."
        )