236 lines
No EOL
7.6 KiB
Python
236 lines
No EOL
7.6 KiB
Python
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional, Union
from urllib.parse import urlparse

from pydantic import (
    BaseModel,
    ConfigDict,
    Field,
    HttpUrl,
    field_validator,
    validator,
)
|
|
|
|
# Common Models
class TokenUsage(BaseModel):
    """Token accounting for a single LLM call.

    All counts default to 0; Optional so upstream payloads may supply None.
    """

    prompt_tokens: Optional[int] = 0
    completion_tokens: Optional[int] = 0
    total_tokens: Optional[int] = 0
|
|
|
|
class ErrorResponse(BaseModel):
    """Standard error payload returned by the API's error handlers."""

    # Human-readable description of what went wrong.
    detail: str
    error_code: str = Field(..., description="Unique error code for this type of error")
    # ISO-8601 string, captured when the response object is built.
    # NOTE(review): datetime.now() is naive local time — confirm whether
    # timezone-aware UTC (datetime.now(timezone.utc)) is intended.
    timestamp: str = Field(default_factory=lambda: datetime.now().isoformat())
    path: Optional[str] = Field(None, description="The endpoint path where error occurred")

    model_config = ConfigDict(json_schema_extra={
        "example": {
            "detail": "Error description",
            "error_code": "ERROR_CODE",
            "timestamp": "2024-12-09T16:49:30.905765",
            "path": "/check-facts"
        }
    })
|
|
|
|
# Fact Check Models
class Publisher(BaseModel):
    """A fact-check publisher (name plus optional website)."""

    name: str
    site: Optional[str] = Field(None, description="Publisher's website")

    # Pydantic v2 validator — this file already uses v2's ConfigDict, so the
    # deprecated v1 `@validator` is replaced with `@field_validator`.
    # Runs in the default "after" mode, same as the old plain `@validator`.
    @field_validator('site')
    @classmethod
    def validate_site(cls, v):
        """Prefix a bare domain with https:// so the stored value is a usable URL."""
        if v and not v.startswith(('http://', 'https://')):
            return f"https://{v}"
        return v
|
|
|
|
class ClaimReview(BaseModel):
    """One publisher's review of a claim.

    Field names are camelCase to mirror the upstream fact-check API payload.
    """

    publisher: Publisher
    url: Optional[HttpUrl] = None
    title: Optional[str] = None
    # Kept as a plain string; no date format is enforced here.
    reviewDate: Optional[str] = None
    # Free-text rating, e.g. "True" (see GoogleFactCheckResponse example).
    textualRating: Optional[str] = None
    languageCode: str = Field(default="en-US")
|
|
|
|
class Claim(BaseModel):
    """A claim together with the reviews published about it.

    camelCase mirrors the upstream fact-check API payload.
    """

    text: str
    # Who made the claim, when known.
    claimant: Optional[str] = None
    claimDate: Optional[str] = None
    # Required field; the list itself carries no length constraint.
    claimReview: List[ClaimReview]
|
|
|
|
class SourceType(str, Enum):
    """Category of a fact-checking source; str-valued so it serializes as JSON."""

    FACT_CHECKER = "fact_checker"
    NEWS_SITE = "news_site"
|
|
|
|
class FactCheckSource(BaseModel):
    """A domain to query during fact-checking, with a 1-10 priority (default 1)."""

    domain: str
    # `type` shadows the builtin, but renaming would break the field's wire name.
    type: SourceType
    priority: int = Field(default=1, ge=1, le=10)
|
|
|
|
# Verification Models
class VerificationResult(BaseModel):
    """Outcome of verifying one claim against source material."""

    verdict: str = Field(..., description="True/False/Insufficient Information")
    confidence: str = Field(..., description="High/Medium/Low")
    # Accepts either a single quote or a list of quotes from the sources.
    evidence: Union[str, List[str]]
    reasoning: str
    # Caveats or information that was unavailable during verification.
    missing_info: Optional[str] = None

    model_config = ConfigDict(json_schema_extra={
        "example": {
            "verdict": "True",
            "confidence": "High",
            "evidence": ["Direct quote from source supporting the claim"],
            "reasoning": "Detailed analysis of why the claim is considered true",
            "missing_info": "Any caveats or limitations of the verification"
        }
    })
|
|
|
|
# Request Models
class BaseFactCheckRequest(BaseModel):
    """Shared request shape: the claim text to be fact-checked."""

    content: str = Field(
        ...,
        min_length=10,
        max_length=1000,
        description="The claim to be fact-checked"
    )

    # Pydantic v2 validator — replaces the deprecated v1 `@validator`
    # (the file already uses v2's ConfigDict elsewhere).
    @field_validator('content')
    @classmethod
    def validate_content(cls, v):
        """Reject whitespace-only content; strip surrounding whitespace otherwise."""
        if not v.strip():
            raise ValueError("Content cannot be empty or just whitespace")
        return v.strip()
|
|
|
|
class GoogleFactCheckRequest(BaseFactCheckRequest):
    """Request for the Google fact-check pipeline."""

    # Locale in "ll-CC" form (e.g. "en-US"), enforced by the regex.
    language: str = Field(default="en-US", pattern="^[a-z]{2}-[A-Z]{2}$")
    max_results_per_source: int = Field(default=10, ge=1, le=50)
|
|
|
|
class AIFactCheckRequest(BaseFactCheckRequest):
    """Request to verify content against a caller-supplied list of URLs."""

    urls: List[str] = Field(
        ...,
        # Pydantic v2: list-size constraints are `min_length`/`max_length`;
        # `min_items`/`max_items` are the deprecated v1 spellings.
        min_length=1,
        max_length=5,
        description="List of URLs to check the content against. URLs will be prefixed with https:// if protocol is missing"
    )

    # Pydantic v2 validator — replaces the deprecated v1 `@validator`.
    @field_validator('urls')
    @classmethod
    def validate_urls(cls, urls):
        """Normalize and sanity-check each URL.

        Empty entries are rejected, a missing protocol defaults to https://,
        and the result must parse with a non-empty network location.
        """
        validated_urls = []
        for url in urls:
            if not url.strip():
                raise ValueError("URL cannot be empty")

            # Add https:// if no protocol specified
            if not url.startswith(('http://', 'https://')):
                url = f'https://{url}'

            # Only wrap urlparse() failures; previously the netloc check's own
            # ValueError was caught by the enclosing except and re-wrapped,
            # producing a doubled "Invalid URL ...: Invalid URL structure ..."
            # message.
            try:
                result = urlparse(url)
            except Exception as e:
                raise ValueError(f"Invalid URL {url}: {str(e)}")
            if not result.netloc:
                raise ValueError(f"Invalid URL structure for {url}")
            validated_urls.append(url)

        return validated_urls

    model_config = ConfigDict(json_schema_extra={
        "example": {
            "content": "Indian flag was drawn in BUET campus",
            "urls": [
                "www.altnews.in/article-about-flag",
                "www.another-source.com/related-news"
            ]
        }
    })
|
|
|
|
# Response Models
class BaseFactCheckResponse(BaseModel):
    """Fields common to every fact-check response."""

    # The content that was checked.
    query: str
    token_usage: TokenUsage
    # Source domains consulted (see example below).
    sources: List[str]
    context_used: List[str]

    model_config = ConfigDict(json_schema_extra={
        "example": {
            "query": "Example statement to verify",
            "token_usage": {
                "prompt_tokens": 100,
                "completion_tokens": 50,
                "total_tokens": 150
            },
            "sources": ["source1.com", "source2.com"],
            "context_used": ["Relevant context from sources"]
        }
    })
|
|
|
|
class GoogleFactCheckResponse(BaseFactCheckResponse):
    """Response from the Google fact-check pipeline.

    `results` and `verification_result` are loosely-typed dicts (raw API
    shapes); see the schema example below for the expected structure.
    """

    total_claims_found: int
    # Raw claim payloads from the upstream API.
    results: List[Dict[str, Any]]
    # Mirrors VerificationResult's shape, kept as a dict here.
    verification_result: Dict[str, Any]
    # Aggregate counters, e.g. total_sources, fact_checking_sites_queried.
    summary: Dict[str, int]

    model_config = ConfigDict(json_schema_extra={
        "example": {
            "query": "Example claim",
            "total_claims_found": 1,
            "results": [{
                "text": "Example claim text",
                "claimant": "Source name",
                "claimReview": [{
                    "publisher": {
                        "name": "Fact Checker",
                        "site": "factchecker.com"
                    },
                    "textualRating": "True"
                }]
            }],
            "verification_result": {
                "verdict": "True",
                "confidence": "High",
                "evidence": ["Supporting evidence"],
                "reasoning": "Detailed analysis"
            },
            "sources": ["factchecker.com"],
            "context_used": ["Relevant context"],
            "token_usage": {
                "prompt_tokens": 100,
                "completion_tokens": 50,
                "total_tokens": 150
            },
            "summary": {
                "total_sources": 1,
                "fact_checking_sites_queried": 10
            }
        }
    })
|
|
|
|
class AIFactCheckResponse(BaseFactCheckResponse):
    """Response from the AI fact-check pipeline, with one result per URL."""

    # Keyed by source URL.
    verification_result: Dict[str, VerificationResult]  # Changed to Dict to store results per URL

    model_config = ConfigDict(json_schema_extra={
        "example": {
            "query": "Indian flag was drawn in BUET campus",
            "verification_result": {
                "https://www.source1.com": {
                    "verdict": "True",
                    "confidence": "High",
                    "evidence": ["Supporting evidence from source 1"],
                    "reasoning": "Detailed analysis from source 1",
                    "missing_info": None
                },
                "https://www.source2.com": {
                    "verdict": "True",
                    "confidence": "Medium",
                    "evidence": ["Supporting evidence from source 2"],
                    "reasoning": "Analysis from source 2",
                    "missing_info": "Additional context needed"
                }
            },
            "sources": ["source1.com", "source2.com"],
            "context_used": [
                "Context from source 1",
                "Context from source 2"
            ],
            "token_usage": {
                "prompt_tokens": 200,
                "completion_tokens": 100,
                "total_tokens": 300
            }
        }
    })
|
|
|
|
# Backwards compatibility aliases
# Old names kept so existing imports keep working; prefer the explicit
# Google/AI class names in new code.
FactCheckRequest = GoogleFactCheckRequest
FactCheckResponse = GoogleFactCheckResponse