# fact-checker-backend/app/api/fact_check.py

import asyncio
import functools
import json
import logging
from datetime import datetime
from enum import Enum
from typing import Dict, List, Optional, Union

import requests
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel, Field, HttpUrl, validator, ConfigDict

from app.config import GOOGLE_FACT_CHECK_API_KEY, GOOGLE_FACT_CHECK_BASE_URL

fact_check_router = APIRouter()

class CustomJSONEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``datetime`` objects as ISO 8601 strings."""

    def default(self, obj):
        """Return a JSON-compatible stand-in for *obj*.

        ``datetime`` instances become their ``isoformat()`` string; every
        other type is handed to the base encoder, which raises ``TypeError``
        for values it cannot serialize.
        """
        if not isinstance(obj, datetime):
            return super().default(obj)
        return obj.isoformat()

class ErrorResponse(BaseModel):
    # Error payload used as the `detail` of HTTPExceptions raised in this
    # module and as the documented schema for non-2xx responses.
    # (Kept as comments rather than a docstring so the generated JSON schema
    # description is unchanged.)
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "detail": "Error description",
                "error_code": "ERROR_CODE",
                "timestamp": "2024-12-09T16:49:30.905765",
                "path": "/check-facts",
            },
        },
    )

    # Human-readable description of what went wrong.
    detail: str
    error_code: str = Field(
        ...,
        description="Unique error code for this type of error",
    )
    # Filled in at instantiation time; datetime.now() is called without a
    # tzinfo, so the value is a naive local-time ISO 8601 string.
    timestamp: str = Field(default_factory=lambda: datetime.now().isoformat())
    path: Optional[str] = Field(
        default=None,
        description="The endpoint path where error occurred",
    )

class RequestValidationError(BaseModel):
    # Schema describing a single validation failure.
    # NOTE(review): not referenced anywhere in the visible part of this
    # module; the field names look like FastAPI/Pydantic validation error
    # entries -- confirm intended use before relying on it.
    loc: List[str]  # location of the offending value, as a list of path segments
    msg: str        # human-readable error message
    type: str       # error type identifier

class Publisher(BaseModel):
    # Publisher of a claim review: a required name plus an optional website.
    name: str
    site: Optional[str] = Field(default=None, description="Publisher's website")

    @validator('site')
    def validate_site(cls, v):
        # Empty values and values that already carry an http(s) scheme are
        # returned untouched; anything else is assumed to be a bare host and
        # gets an "https://" prefix.
        if not v or v.startswith(('http://', 'https://')):
            return v
        return f"https://{v}"

class ClaimReview(BaseModel):
    # One published review of a claim. Only the publisher is required; every
    # other attribute defaults to None, and languageCode defaults to "en-US".
    publisher: Publisher
    url: Optional[HttpUrl] = Field(default=None)
    title: Optional[str] = Field(default=None)
    reviewDate: Optional[str] = Field(default=None)
    textualRating: Optional[str] = Field(default=None)
    languageCode: str = "en-US"

class Claim(BaseModel):
    # A claim together with the reviews published about it. `text` and
    # `claimReview` are required; claimant and claimDate are optional.
    text: str
    claimant: Optional[str] = Field(default=None)
    claimDate: Optional[str] = Field(default=None)
    claimReview: List[ClaimReview]

class FactCheckResponse(BaseModel):
    # Successful response body of the /check-facts endpoint.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "query": "Example claim",
                "total_claims_found": 1,
                "results": [
                    {
                        "text": "Example claim text",
                        "claimant": "Source name",
                        "claimReview": [
                            {
                                "publisher": {
                                    "name": "Fact Checker",
                                    "site": "factchecker.com",
                                },
                                "textualRating": "True",
                            },
                        ],
                    },
                ],
                "summary": {
                    "total_sources": 1,
                    "fact_checking_sites_queried": 10,
                },
            },
        },
    )

    query: str = Field(
        ...,
        description="Original query that was fact-checked",
    )
    # Number of claims in `results`; must be non-negative.
    total_claims_found: int = Field(..., ge=0)
    results: List[Claim] = Field(default_factory=list)
    # Integer counters keyed by name (see the example above for the keys the
    # endpoint fills in).
    summary: Dict[str, int] = Field(...)

class SourceType(str, Enum):
    # Category of a configured source. Members are also `str` instances, so
    # they compare equal to (and serialize as) their string values.
    FACT_CHECKER = "fact_checker"  # dedicated fact-checking outlet
    NEWS_SITE = "news_site"        # general news outlet

class FactCheckSource(BaseModel):
    # One configured publisher site: its domain, its category, and a
    # priority constrained to 1..10 (defaults to 1).
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "domain": "factcheck.org",
                "type": "fact_checker",
                "priority": 1,
            },
        },
    )

    domain: str
    type: SourceType
    priority: int = Field(1, ge=1, le=10)

# Validated source configuration, keyed by group name. Each group is built
# from a (key, source type, priority, domains) row so that every domain goes
# through FactCheckSource validation at import time.
SOURCES = {
    group_key: [
        FactCheckSource(domain=domain, type=source_type, priority=priority)
        for domain in domains
    ]
    for group_key, source_type, priority, domains in (
        (
            "fact_checkers",
            SourceType.FACT_CHECKER,
            1,
            (
                "factcheck.org",
                "snopes.com",
                "politifact.com",
                "reuters.com",
                "bbc.com",
                "apnews.com",
                "usatoday.com",
                "nytimes.com",
                "washingtonpost.com",
                "afp.com",
                "fullfact.org",
                "truthorfiction.com",
                "leadstories.com",
                "altnews.in",
                "boomlive.in",
                "en.prothomalo.com",
            ),
        ),
        (
            "news_sites",
            SourceType.NEWS_SITE,
            2,
            (
                "www.thedailystar.net",
                "www.thefinancialexpress.com.bd",
                "www.theindependentbd.com",
                "www.dhakatribune.com",
                "www.newagebd.net",
                "www.observerbd.com",
                "www.daily-sun.com",
                "www.tbsnews.net",
                "www.businesspostbd.com",
                "www.banglanews24.com/english",
                "www.bdnews24.com/english",
                "www.risingbd.com/english",
                "www.dailyindustry.news",
                "www.bangladeshpost.net",
                "www.daily-bangladesh.com/english",
            ),
        ),
    )
}

class FactCheckRequest(BaseModel):
    # Request body of the /check-facts endpoint.
    content: str = Field(
        ...,
        min_length=10,
        max_length=1000,
        description="The claim to be fact-checked"
    )
    # Language tag of the form "xx-XX" (two lowercase letters, a hyphen,
    # two uppercase letters).
    language: str = Field(default="en-US", pattern="^[a-z]{2}-[A-Z]{2}$")
    max_results_per_source: int = Field(default=10, ge=1, le=50)

    # pre=True: strip *before* the min_length/max_length constraints are
    # evaluated. Running after them let whitespace-padded input such as
    # "   a      " satisfy min_length=10 and then be stored as "a".
    @validator('content', pre=True)
    def validate_content(cls, v):
        # Non-string input is passed through untouched so the regular `str`
        # validation reports the type error.
        if not isinstance(v, str):
            return v
        stripped = v.strip()
        if not stripped:
            raise ValueError("Content cannot be empty or just whitespace")
        return stripped

# Module-level logger for per-source diagnostics.
_logger = logging.getLogger(__name__)


def _http_error(status_code: int, detail: str, error_code: str) -> HTTPException:
    """Build an HTTPException whose detail is a serialized ErrorResponse."""
    return HTTPException(
        status_code=status_code,
        detail=ErrorResponse(
            detail=detail,
            error_code=error_code,
            path="/check-facts",
        ).model_dump(),
    )


async def fetch_fact_checks(
    api_key: str,
    base_url: str,
    query: str,
    site: FactCheckSource,
    language_code: str = "en-US",
    page_size: int = 10,
    timeout: float = 10.0,
) -> Dict:
    """
    Fetch fact checks from a specific site using the Google Fact Check API

    Args:
        api_key: API key sent as the ``key`` query parameter.
        base_url: URL the GET request is issued against.
        query: Claim text to search for.
        site: Source whose ``domain`` is sent as ``reviewPublisherSiteFilter``.
        language_code: Value of the ``languageCode`` query parameter.
        page_size: Value of the ``pageSize`` query parameter.
        timeout: Seconds to wait for the upstream service before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        HTTPException: 500 (``CONFIGURATION_ERROR``) when the API key or base
            URL is empty; 503 (``FACT_CHECK_SERVICE_ERROR``) when the request
            fails, returns an error status, or the body is not valid JSON.
    """
    # Checked outside the try block so that unrelated ValueErrors (e.g. a
    # JSON decode failure) are never reported as a configuration problem.
    if not api_key or not base_url:
        raise _http_error(
            500,
            "API key or base URL not configured",
            "CONFIGURATION_ERROR",
        )

    params = {
        "key": api_key,
        "query": query,
        "languageCode": language_code,
        "reviewPublisherSiteFilter": site.domain,
        "pageSize": page_size,
    }

    try:
        # requests is blocking: run it in the default executor so the event
        # loop keeps serving other requests, and always bound it by a timeout.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            functools.partial(
                requests.get, base_url, params=params, timeout=timeout
            ),
        )
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # The exception text from requests contains the full request URL,
        # including the `key` query parameter. Never echo it to the client;
        # log a redacted copy and return only a coarse reason.
        _logger.warning(
            "Fact check request for %s failed: %s",
            site.domain,
            str(e).replace(api_key, "[REDACTED]"),
        )
        status = getattr(getattr(e, "response", None), "status_code", None)
        reason = f"HTTP {status}" if status is not None else type(e).__name__
        raise _http_error(
            503,
            f"Error fetching from {site.domain}: {reason}",
            "FACT_CHECK_SERVICE_ERROR",
        ) from e


@fact_check_router.post(
    "/check-facts",
    response_model=FactCheckResponse,
    responses={
        400: {"model": ErrorResponse},
        404: {"model": ErrorResponse},
        500: {"model": ErrorResponse},
        503: {"model": ErrorResponse}
    }
)
async def check_facts(request: FactCheckRequest) -> FactCheckResponse:
    """
    Check facts using multiple fact-checking sources
    """
    # Fail fast when the service is not configured.
    if not GOOGLE_FACT_CHECK_API_KEY or not GOOGLE_FACT_CHECK_BASE_URL:
        raise _http_error(
            500,
            "API configuration is missing",
            "CONFIGURATION_ERROR",
        )

    # Query sources in ascending priority order. sorted() is stable, so
    # sources with equal priority keep their configured order, and the
    # shared SOURCES lists are left untouched.
    all_sources = sorted(
        SOURCES["fact_checkers"] + SOURCES["news_sites"],
        key=lambda source: source.priority,
    )

    all_results: List[Claim] = []
    for source in all_sources:
        try:
            # Honor the caller's validated language and page size instead of
            # the fetch helper's defaults.
            result = await fetch_fact_checks(
                GOOGLE_FACT_CHECK_API_KEY,
                GOOGLE_FACT_CHECK_BASE_URL,
                request.content,
                source,
                language_code=request.language,
                page_size=request.max_results_per_source,
            )
            raw_claims = list(result.get("claims") or [])
        except HTTPException:
            # Configuration and upstream-service errors abort the request.
            raise
        except Exception:
            # Unexpected payload shape: log it and move on to the next source.
            _logger.exception("Error processing %s", source.domain)
            continue

        # Validate claims one at a time so a single malformed entry does not
        # discard every other claim returned by the same source.
        for raw_claim in raw_claims:
            try:
                all_results.append(Claim(**raw_claim))
            except (TypeError, ValueError) as exc:
                _logger.warning(
                    "Skipping malformed claim from %s: %s", source.domain, exc
                )

    if not all_results:
        raise _http_error(
            404,
            "No fact check results found",
            "NO_RESULTS_FOUND",
        )

    # Distinct publisher sites, taken from the first review of each claim.
    # A missing site (None) counts as one distinct value.
    publisher_sites = {
        claim.claimReview[0].publisher.site
        for claim in all_results
        if claim.claimReview
    }

    return FactCheckResponse(
        query=request.content,
        total_claims_found=len(all_results),
        results=all_results,
        summary={
            "total_sources": len(publisher_sites),
            "fact_checking_sites_queried": len(all_sources),
        },
    )