2024-12-17 11:33:44 +00:00
18 changed files with 1263 additions and 267 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +1,4 @@
 env
 .env
 test.py
-/__pycache__/
+__pycache__
--- a/app/pycache/config.cpython-312.pyc
+++ b/app/pycache/config.cpython-312.pyc
--- a/app/api/pycache/fact_check.cpython-312.pyc
+++ b/app/api/pycache/fact_check.cpython-312.pyc
--- a/app/api/ai_fact_check.py
+++ b/app/api/ai_fact_check.py
@ -0,0 +1,110 @@
 from fastapi import APIRouter, HTTPException
 from app.services.openai_client import OpenAIClient, AIFactChecker
 from app.config import OPENAI_API_KEY
 from app.models.ai_fact_check_models import (
    AIFactCheckRequest,
    AIFactCheckResponse,
    VerificationResult,
    TokenUsage,
    ErrorResponse
 )
 from urllib.parse import urlparse
 import asyncio
 # Module-level objects created once at import time:
 #   - aifact_check_router: the APIRouter the /aicheck-facts endpoint is registered on
 #   - openai_client: OpenAIClient built with OPENAI_API_KEY from app.config
 #   - fact_checker: AIFactChecker wrapping that client
 aifact_check_router = APIRouter()
 openai_client = OpenAIClient(api_key=OPENAI_API_KEY)
 fact_checker = AIFactChecker(openai_client=openai_client)
def _failed_url_result(error):
    """Build the placeholder result reported for a URL that could not be checked."""
    return VerificationResult(
        verdict="Error",
        confidence="Low",
        evidence=f"Error checking URL: {str(error)}",
        reasoning="URL processing failed",
        missing_info="Could not access or process the URL"
    )


@aifact_check_router.post(
    "/aicheck-facts",
    response_model=AIFactCheckResponse,
    responses={
        400: {"model": ErrorResponse},
        500: {"model": ErrorResponse}
    }
)
async def ai_fact_check(request: AIFactCheckRequest):
    """
    Endpoint to fact-check a given statement based on multiple webpage URLs.

    Every URL in ``request.urls`` is checked concurrently through
    ``fact_checker.check_fact``. A URL whose check raises, or whose result
    cannot be converted into a ``VerificationResult``, is reported with an
    "Error" verdict for that URL only; the other URLs are still returned.

    Input:
        - urls: List of webpage URLs to analyze (with or without http/https)
        - content: The fact statement to verify
    Response:
        - AIFactCheckResponse with one verification result per URL, the
          sorted union of all reported sources, and the summed token usage
    Raises:
        HTTPException: 400 with error_code INVALID_URL on ValueError,
            500 with error_code PROCESSING_ERROR on any other failure.
    """
    try:
        results = {}
        all_sources = set()
        total_prompt_tokens = 0
        total_completion_tokens = 0
        total_tokens = 0

        # Process all URLs concurrently. return_exceptions=True hands each
        # failure back as a value so one bad URL cannot abort the others.
        fact_check_results = await asyncio.gather(
            *(
                fact_checker.check_fact(url=url, query=request.content)
                for url in request.urls
            ),
            return_exceptions=True
        )

        for url, result in zip(request.urls, fact_check_results):
            if isinstance(result, BaseException):
                results[url] = _failed_url_result(result)
                continue

            # Stage everything first and commit only when the whole result is
            # usable, so a half-valid result never skews sources or totals.
            try:
                verification = result["verification_result"]
                verification_result = VerificationResult(
                    verdict=verification["verdict"],
                    confidence=verification["confidence"],
                    evidence=verification["evidence"],
                    reasoning=verification["reasoning"],
                    missing_info=verification.get("missing_info", None)
                )
                url_sources = list(result["sources"])
                usage = result["token_usage"]
                prompt_tokens = total_prompt_tokens + usage["prompt_tokens"]
                completion_tokens = total_completion_tokens + usage["completion_tokens"]
                combined_tokens = total_tokens + usage["total_tokens"]
            except (KeyError, TypeError, AttributeError, ValueError) as malformed:
                # pydantic's ValidationError is a ValueError: without this
                # guard a malformed result would reach the outer handler and
                # be misreported to the client as a 400 INVALID_URL.
                results[url] = _failed_url_result(malformed)
                continue

            results[url] = verification_result
            all_sources.update(url_sources)
            total_prompt_tokens = prompt_tokens
            total_completion_tokens = completion_tokens
            total_tokens = combined_tokens

        token_usage = TokenUsage(
            prompt_tokens=total_prompt_tokens,
            completion_tokens=total_completion_tokens,
            total_tokens=total_tokens
        )
        return AIFactCheckResponse(
            query=request.content,
            verification_result=results,
            # sorted() gives a stable order; iterating the set directly does not
            sources=sorted(all_sources),
            token_usage=token_usage
        )
    except ValueError as e:
        raise HTTPException(
            status_code=400,
            detail=ErrorResponse(
                detail=str(e),
                error_code="INVALID_URL",
                path="/aicheck-facts"
            ).dict()
        )
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=ErrorResponse(
                detail=f"Error processing fact-check request: {str(e)}",
                error_code="PROCESSING_ERROR",
                path="/aicheck-facts"
            ).dict()
        )
--- a/app/api/fact_check.py
+++ b/app/api/fact_check.py
@ -1,291 +1,192 @@
 from fastapi import APIRouter, HTTPException
-from pydantic import BaseModel, Field, HttpUrl, validator, ConfigDict
+import httpx
-from typing import Dict, List, Optional, Union
+from app.config import GOOGLE_API_KEY, GOOGLE_FACT_CHECK_BASE_URL, OPENAI_API_KEY
-import requests
+from app.api.scrap_websites import search_websites, SearchRequest
-from enum import Enum
+from app.services.openai_client import OpenAIClient
-from datetime import datetime
+from app.models.fact_check_models import (
-import json
+    FactCheckRequest, 
-from app.config import GOOGLE_FACT_CHECK_API_KEY, GOOGLE_FACT_CHECK_BASE_URL
+    FactCheckResponse, 
    ErrorResponse,
    Source
 )
 from app.websites.fact_checker_website import get_all_sources
 # Router for this module's endpoints and a module-wide OpenAI client,
 # both created once at import time (the client is built from OPENAI_API_KEY).
 fact_check_router = APIRouter()
 openai_client = OpenAIClient(OPENAI_API_KEY)
-class CustomJSONEncoder(json.JSONEncoder):
+async def generate_fact_report(query: str, fact_check_data: dict) -> FactCheckResponse:
-    def default(self, obj):
+    """Generate a fact check report using OpenAI based on the fact check results."""
-        if isinstance(obj, datetime):
+    try:
-            return obj.isoformat()
+        base_system_prompt = """You are a professional fact-checking reporter. Your task is to create a detailed fact check report based on the provided data. Focus on accuracy, clarity, and proper citation of sources.
        return super().default(obj)
-class ErrorResponse(BaseModel):
+Rules:
-    detail: str
+1. Include all source URLs and names in the sources list
-    error_code: str = Field(..., description="Unique error code for this type of error")
+2. Keep the explanation focused on verifiable facts
-    timestamp: str = Field(default_factory=lambda: datetime.now().isoformat())
+3. Include dates when available
-    path: Optional[str] = Field(None, description="The endpoint path where error occurred")
+4. Maintain objectivity in the report"""
-    model_config = ConfigDict(json_schema_extra={
+        base_user_prompt = """Generate a comprehensive fact check report in this exact JSON format:
-        "example": {
+{
-            "detail": "Error description",
+    "claim": "Write the exact claim being verified",
-            "error_code": "ERROR_CODE",
+    "verdict": "One of: True/False/Partially True/Unverified",
-            "timestamp": "2024-12-09T16:49:30.905765",
+    "confidence": "One of: High/Medium/Low",
-            "path": "/check-facts"
+    "sources": [
        {
            "url": "Full URL of the source",
            "name": "Name of the source organization"
        }
    })
 class RequestValidationError(BaseModel):
    # Three-field record: `loc` (list of strings), `msg` and `type` (strings).
    # NOTE(review): the field names look like one entry of a FastAPI/pydantic
    # validation-error list — presumably meant for documenting 422 responses;
    # nothing in view references this model, confirm before relying on it.
    loc: List[str]
    msg: str
    type: str
 class Publisher(BaseModel):
    # Publisher of a claim review: a required name and an optional website.
    name: str
    site: Optional[str] = Field(None, description="Publisher's website")

    @validator('site')
    def validate_site(cls, v):
        """Prefix ``https://`` when a non-empty site has no http(s) scheme."""
        if not v:
            # None / empty string pass through untouched
            return v
        if v.startswith(('http://', 'https://')):
            return v
        return "https://{}".format(v)
 class ClaimReview(BaseModel):
    # One review of a claim. Only `publisher` is required; `url` is validated
    # as an HttpUrl when given, the remaining text fields default to None and
    # `languageCode` defaults to "en-US".
    publisher: Publisher
    url: Optional[HttpUrl] = None
    title: Optional[str] = None
    reviewDate: Optional[str] = None  # kept as a plain string, not parsed into a date
    textualRating: Optional[str] = None
    languageCode: str = Field(default="en-US")
 class Claim(BaseModel):
    # A claim and the reviews attached to it. `text` and `claimReview` are
    # required; `claimant` and `claimDate` are optional strings.
    text: str
    claimant: Optional[str] = None
    claimDate: Optional[str] = None  # kept as a plain string, not parsed into a date
    claimReview: List[ClaimReview]
 class FactCheckResponse(BaseModel):
    # Response payload: the original query, a non-negative claim count, the
    # validated claims and a string -> int summary mapping. The
    # json_schema_extra example below only feeds the generated schema.
    query: str = Field(..., description="Original query that was fact-checked")
    total_claims_found: int = Field(..., ge=0)
    results: List[Claim] = Field(default_factory=list)  # defaults to an empty list
    summary: Dict[str, int] = Field(...)
    model_config = ConfigDict(json_schema_extra={
        "example": {
            "query": "Example claim",
            "total_claims_found": 1,
            "results": [{
                "text": "Example claim text",
                "claimant": "Source name",
                "claimReview": [{
                    "publisher": {
                        "name": "Fact Checker",
                        "site": "factchecker.com"
                    },
                    "textualRating": "True"
                }]
            }],
            "summary": {
                "total_sources": 1,
                "fact_checking_sites_queried": 10
            }
        }
    })
 class SourceType(str, Enum):
    # String-valued enum (mixes in `str`) with the two source categories.
    FACT_CHECKER = "fact_checker"
    NEWS_SITE = "news_site"
 class FactCheckSource(BaseModel):
    # A configured source site: its domain, its category and a priority
    # constrained to 1..10 (default 1).
    domain: str
    type: SourceType
    priority: int = Field(default=1, ge=1, le=10)
    # Example shown in the generated JSON schema only.
    model_config = ConfigDict(json_schema_extra={
        "example": {
            "domain": "factcheck.org",
            "type": "fact_checker",
            "priority": 1
        }
    })
 # Sources configuration with validation
 SOURCES = {
    "fact_checkers": [
        FactCheckSource(domain=domain, type=SourceType.FACT_CHECKER, priority=1)
        for domain in [
            "factcheck.org",
            "snopes.com",
            "politifact.com",
            "reuters.com",
            "bbc.com",
            "apnews.com",
            "usatoday.com",
            "nytimes.com",
            "washingtonpost.com",
            "afp.com",
            "fullfact.org",
            "truthorfiction.com",
            "leadstories.com",
            "altnews.in",
            "boomlive.in",
            "en.prothomalo.com"
        ]
    ],
-    "news_sites": [
+    "evidence": "A concise summary of the key evidence (1-2 sentences)",
-        FactCheckSource(domain=domain, type=SourceType.NEWS_SITE, priority=2)
+    "explanation": "A detailed explanation including who verified it, when it was verified, and the key findings (2-3 sentences)",
-        for domain in [
+    "additional_context": "Important context about the verification process, limitations, or broader implications (1-2 sentences)"
            "www.thedailystar.net",
            "www.thefinancialexpress.com.bd",
            "www.theindependentbd.com",
            "www.dhakatribune.com",
            "www.newagebd.net",
            "www.observerbd.com",
            "www.daily-sun.com",
            "www.tbsnews.net",
            "www.businesspostbd.com",
            "www.banglanews24.com/english",
            "www.bdnews24.com/english",
            "www.risingbd.com/english",
            "www.dailyindustry.news",
            "www.bangladeshpost.net",
            "www.daily-bangladesh.com/english"
        ]
    ]
 }
-class FactCheckRequest(BaseModel):
+Ensure all URLs in sources are complete (including https:// if missing) and each source has both a URL and name."""
-    content: str = Field(
+
-        ...,
+        if "claims" in fact_check_data:
-        min_length=10,
+            system_prompt = base_system_prompt
-        max_length=1000,
+            user_prompt = f"""Query: {query}
-        description="The claim to be fact-checked"
+            Fact Check Results: {fact_check_data}
            {base_user_prompt}
            The report should:
            1. Include ALL source URLs and organization names
            2. Specify verification dates when available 
            3. Name the fact-checking organizations involved
            4. Describe the verification process"""
        else:
            system_prompt = base_system_prompt
            user_prompt = f"""Query: {query}
            Fact Check Results: {fact_check_data}
            {base_user_prompt}
            The report should:
            1. Include ALL source URLs and names from both verification_result and sources fields
            2. Mention all fact-checking organizations involved
            3. Describe the verification process
            4. Note any conflicting information between sources"""
        response = await openai_client.generate_text_response(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            max_tokens=1000
        )
    language: str = Field(default="en-US", pattern="^[a-z]{2}-[A-Z]{2}$")
    max_results_per_source: int = Field(default=10, ge=1, le=50)
    @validator('content')
    def validate_content(cls, v):
        if not v.strip():
            raise ValueError("Content cannot be empty or just whitespace")
        return v.strip()
 async def fetch_fact_checks(
    api_key: str,
    base_url: str,
    query: str,
    site: FactCheckSource
 ) -> Dict:
    """
    Fetch fact checks from a specific site using the Google Fact Check API
    """
        try:
-        if not api_key or not base_url:
+            # First try to parse the response directly
-            raise ValueError("API key or base URL not configured")
+            response_data = response["response"]
            # Clean up sources before validation
            if isinstance(response_data.get('sources'), list):
                cleaned_sources = []
                for source in response_data['sources']:
                    if isinstance(source, str):
                        # Convert string sources to Source objects
                        url = source if source.startswith('http') else f"https://{source}"
                        cleaned_sources.append({
                            "url": url,
                            "name": source
                        })
                    elif isinstance(source, dict):
                        # Ensure URL has proper scheme
                        url = source.get('url', '')
                        if url and not url.startswith('http'):
                            source['url'] = f"https://{url}"
                        cleaned_sources.append(source)
                response_data['sources'] = cleaned_sources
            fact_check_response = FactCheckResponse(**response_data)
            return fact_check_response
        except Exception as validation_error:
            print(f"Response validation error: {str(validation_error)}")
            raise HTTPException(
                status_code=422,
                detail=ErrorResponse(
                    detail=f"Invalid response format: {str(validation_error)}",
                    error_code="VALIDATION_ERROR",
                    path="/check-facts"
                ).dict()
            )
    except Exception as e:
        print(f"Error generating fact report: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=ErrorResponse(
                detail="Error generating fact report",
                error_code="FACT_CHECK_ERROR",
                path="/check-facts"
            ).dict()
        )
@fact_check_router.post("/check-facts", response_model=FactCheckResponse)
 async def check_facts(request: FactCheckRequest):
    """
    Fetch fact check results and generate a comprehensive report.
    """
    if not GOOGLE_API_KEY or not GOOGLE_FACT_CHECK_BASE_URL:
        raise HTTPException(
            status_code=500,
            detail=ErrorResponse(
                detail="Google API key or base URL is not configured",
                error_code="CONFIGURATION_ERROR",
                path="/check-facts"
            ).dict()
        )
    headers = {"Content-Type": "application/json"}
    async with httpx.AsyncClient() as client:
        # Get fact checker sources from the centralized configuration
        fact_checker_sources = get_all_sources()
        for source in fact_checker_sources:
            params = {
-            "key": api_key,
+                "key": GOOGLE_API_KEY,
-            "query": query,
+                "query": request.query,
                "languageCode": "en-US",
-            "reviewPublisherSiteFilter": site.domain,
+                "reviewPublisherSiteFilter": source.domain,
                "pageSize": 10
            }
        response = requests.get(base_url, params=params)
        response.raise_for_status()
        return response.json()
    except requests.RequestException as e:
        raise HTTPException(
            status_code=503,
            detail=ErrorResponse(
                detail=f"Error fetching from {site.domain}: {str(e)}",
                error_code="FACT_CHECK_SERVICE_ERROR",
                path="/check-facts"
            ).dict()
        )
    except ValueError as e:
        raise HTTPException(
            status_code=500,
            detail=ErrorResponse(
                detail=str(e),
                error_code="CONFIGURATION_ERROR",
                path="/check-facts"
            ).dict()
        )
@fact_check_router.post(
    "/check-facts",
    response_model=FactCheckResponse,
    responses={
        400: {"model": ErrorResponse},
        404: {"model": ErrorResponse},
        500: {"model": ErrorResponse},
        503: {"model": ErrorResponse}
    }
 )
 async def check_facts(request: FactCheckRequest) -> FactCheckResponse:
    """
    Check facts using multiple fact-checking sources
    """
    all_results = []
    # Validate configuration
    if not GOOGLE_FACT_CHECK_API_KEY or not GOOGLE_FACT_CHECK_BASE_URL:
        raise HTTPException(
            status_code=500,
            detail=ErrorResponse(
                detail="API configuration is missing",
                error_code="CONFIGURATION_ERROR",
                path="/check-facts"
            ).dict()
        )
    # Check all sources in priority order
    all_sources = (
        SOURCES["fact_checkers"] +
        SOURCES["news_sites"]
    )
    all_sources.sort(key=lambda x: x.priority)
    for source in all_sources:
            try:
-            result = await fetch_fact_checks(
+                response = await client.get(
                GOOGLE_FACT_CHECK_API_KEY,
                    GOOGLE_FACT_CHECK_BASE_URL,
-                request.content,
+                    params=params,
-                source
+                    headers=headers
                )
                response.raise_for_status()
                json_response = response.json()
-            if "claims" in result:
+                if json_response.get("claims"):
-                # Validate each claim through Pydantic
+                    return await generate_fact_report(request.query, json_response)
                validated_claims = [
                    Claim(**claim).dict()
                    for claim in result["claims"]
                ]
                all_results.extend(validated_claims)
-        except HTTPException:
+            except httpx.RequestError as e:
-            raise
+                print(f"Error fetching results for site {source.domain}: {str(e)}")
                continue
            except Exception as e:
-            # Log the error but continue with other sources
+                print(f"Unexpected error for site {source.domain}: {str(e)}")
            print(f"Error processing {source.domain}: {str(e)}")
                continue
-    if not all_results:
+        try:
            search_request = SearchRequest(
                search_text=request.query,
                source_types=["fact_checkers"]
            )
            ai_response = await search_websites(search_request)
            return await generate_fact_report(request.query, ai_response)
        except Exception as e:
            print(f"Error in AI fact check: {str(e)}")
            raise HTTPException(
                status_code=404,
                detail=ErrorResponse(
                    detail="No fact check results found",
-                error_code="NO_RESULTS_FOUND",
+                    error_code="NOT_FOUND",
                    path="/check-facts"
                ).dict()
            )
    # Create the response using Pydantic model
    response = FactCheckResponse(
        query=request.content,
        total_claims_found=len(all_results),
        results=all_results,
        summary={
            "total_sources": len(set(claim.get("claimReview", [{}])[0].get("publisher", {}).get("site", "") 
                                 for claim in all_results if claim.get("claimReview"))),
            "fact_checking_sites_queried": len(all_sources)
        }
    )
    return response
--- a/app/api/scrap_websites.py
+++ b/app/api/scrap_websites.py
@ -0,0 +1,160 @@
 from fastapi import APIRouter, HTTPException
 import httpx
 import logging
 from urllib.parse import urlparse
 from typing import List, Dict, Optional
 from pydantic import BaseModel
 from app.models.ai_fact_check_models import (
    AIFactCheckRequest,
    FactCheckSource,
    SourceType
 )
 from app.websites.fact_checker_website import SOURCES, get_all_sources
 from app.api.ai_fact_check import ai_fact_check
 from app.config import GOOGLE_API_KEY, GOOGLE_ENGINE_ID, GOOGLE_SEARCH_URL
 class SearchRequest(BaseModel):
    # Request body for the /search endpoint.
    search_text: str  # the text that is searched for and then fact-checked
    # Keys into SOURCES selecting which source groups to search;
    # defaults to the "fact_checkers" group.
    source_types: List[str] = ["fact_checkers"]
 # Configure logging.
 # NOTE: basicConfig runs at import time and configures the root logger
 # (INFO level, timestamp/name/level/message format).
 logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 )
 logger = logging.getLogger(__name__)
 scrap_websites_router = APIRouter()
 # Constants
 RESULTS_PER_PAGE = 10      # sent as "num" and used to compute the "start" offset of each page
 MAX_PAGES = 5              # upper bound on result pages requested per search
 MAX_URLS_PER_DOMAIN = 5    # cap on URLs collected for any single domain
 def get_domain_from_url(url: str) -> str:
    """Return the lower-cased host of *url* without a leading ``www.``.

    ``hostname`` is used rather than ``netloc`` so that a port or userinfo
    (``https://user@www.snopes.com:443/x``) does not end up in the result
    and defeat the comparison against configured source domains.

    Returns:
        The bare host, or an empty string when the URL has no host (for
        example when the scheme is missing) or cannot be parsed.
    """
    try:
        # hostname is already lower-cased by urllib and is None when absent
        host = urlparse(url).hostname or ""
    except Exception as e:
        logger.error(f"Error extracting domain from URL {url}: {str(e)}")
        return ""
    return host.removeprefix('www.')
 def is_valid_source_domain(domain: str, sources: List[FactCheckSource]) -> bool:
    """Tell whether *domain* belongs to one of the configured *sources*.

    Both sides are lower-cased and stripped of a leading ``www.``. A source
    entry may carry a path (for example ``www.banglanews24.com/english``);
    only its host part is compared, because *domain* is a bare host and
    could otherwise never match such an entry.

    Args:
        domain: Host name to test.
        sources: Objects exposing a ``domain`` attribute.

    Returns:
        True when *domain* equals a source host or is a sub-domain of one.
    """
    if not domain:
        return False
    host = domain.lower().removeprefix('www.')
    for source in sources:
        source_host = source.domain.lower().removeprefix('www.').split('/', 1)[0]
        if not source_host:
            # an empty entry must not match hosts that merely end with "."
            continue
        if host == source_host or host.endswith('.' + source_host):
            return True
    return False
 async def build_enhanced_search_query(query: str, sources: List[FactCheckSource]) -> str:
    """Return *query* followed by an OR-joined ``site:`` filter, one per source."""
    restriction = " OR ".join("site:{}".format(src.domain) for src in sources)
    return "({}) ({})".format(query, restriction)
 async def google_custom_search(query: str, sources: List[FactCheckSource], page: int = 1) -> Optional[Dict]:
    """Run one page of a Google Custom Search restricted to *sources*.

    Args:
        query: Free-text search query.
        sources: Sources whose domains become ``site:`` restrictions.
        page: 1-based result page; converted to the API's 1-based ``start``
            index using RESULTS_PER_PAGE.

    Returns:
        The decoded JSON body of the search response.

    Raises:
        HTTPException: 500 when the request fails, the upstream service
            answers with an error status, or the body is not valid JSON.
    """
    enhanced_query = await build_enhanced_search_query(query, sources)
    start_index = ((page - 1) * RESULTS_PER_PAGE) + 1
    params = {
        "key": GOOGLE_API_KEY,
        "cx": GOOGLE_ENGINE_ID,
        "q": enhanced_query,
        "num": RESULTS_PER_PAGE,
        "start": start_index
    }
    async with httpx.AsyncClient(timeout=30.0) as client:
        try:
            response = await client.get(GOOGLE_SEARCH_URL, params=params)
            response.raise_for_status()
            return response.json()
        except httpx.HTTPStatusError as e:
            # The exception text embeds the full request URL, whose query
            # string carries the API key. Report only the status code so the
            # key reaches neither the client nor the log; `from None` keeps
            # it out of the chained traceback as well.
            status = e.response.status_code
            logger.error(f"Search error: upstream returned HTTP {status}")
            raise HTTPException(
                status_code=500,
                detail=f"Search error: upstream returned HTTP {status}"
            ) from None
        except Exception as e:
            logger.error(f"Search error: {str(e)}")
            raise HTTPException(status_code=500, detail=f"Search error: {str(e)}")
@scrap_websites_router.post("/search")
async def search_websites(request: SearchRequest):
    """
    Search the configured source sites and fact-check the query against the hits.

    Pages through Google Custom Search (at most MAX_PAGES pages), keeps
    result URLs whose domain belongs to a selected source (at most
    MAX_URLS_PER_DOMAIN per domain), then passes the first few URLs to
    ``ai_fact_check``.

    Returns:
        The ``ai_fact_check`` response, or
        ``{"status": "no_results", "urls_found": 0}`` when no URL qualified.

    Raises:
        HTTPException: errors raised by the search or the fact check keep
            their own status code; anything else becomes a 500.
    """
    max_candidate_urls = 50  # stop paging once this many URLs are collected
    max_urls_to_check = 5    # AIFactCheckRequest accepts at most 5 URLs

    # Get the source types from the request
    source_types = request.source_types if request.source_types else ["fact_checkers"]

    # Get sources based on requested types; unknown types are ignored
    selected_sources = []
    for source_type in source_types:
        if source_type in SOURCES:
            selected_sources.extend(SOURCES[source_type])

    # If no valid sources found, use fact checkers as default
    if not selected_sources:
        selected_sources = SOURCES["fact_checkers"]

    all_urls = []
    domain_results = {}
    try:
        for page in range(1, MAX_PAGES + 1):
            if len(all_urls) >= max_candidate_urls:
                break
            search_response = await google_custom_search(request.search_text, selected_sources, page)
            if not search_response or not search_response.get("items"):
                break
            for item in search_response.get("items", []):
                url = item.get("link")
                if not url:
                    continue
                domain = get_domain_from_url(url)
                if is_valid_source_domain(domain, selected_sources):
                    hits = domain_results.setdefault(domain, [])
                    if len(hits) < MAX_URLS_PER_DOMAIN:
                        hits.append({
                            "url": url,
                            "title": item.get("title", ""),
                            "snippet": item.get("snippet", "")
                        })
                        all_urls.append(url)
                if len(all_urls) >= max_candidate_urls:
                    break

        if not all_urls:
            return {
                "status": "no_results",
                "urls_found": 0
            }

        fact_check_request = AIFactCheckRequest(
            content=request.search_text,
            urls=all_urls[:max_urls_to_check]
        )
        return await ai_fact_check(fact_check_request)
    except HTTPException:
        # Already a well-formed API error (e.g. a 400 from ai_fact_check);
        # re-wrapping it below would flatten every status into a 500.
        raise
    except Exception as e:
        logger.error(f"Error during search/fact-check process: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
--- a/app/config.py
+++ b/app/config.py
@ -3,8 +3,10 @@ from dotenv import load_dotenv
 load_dotenv()
-GOOGLE_FACT_CHECK_API_KEY = os.environ["GOOGLE_FACT_CHECK_API_KEY"]
+GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]
 GOOGLE_FACT_CHECK_BASE_URL= os.environ["GOOGLE_FACT_CHECK_BASE_URL"]
 GOOGLE_ENGINE_ID = os.environ["GOOGLE_ENGINE_ID"]
 GOOGLE_SEARCH_URL = os.environ["GOOGLE_SEARCH_URL"]
 OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
 FRONTEND_URL = os.environ["FRONTEND_URL"]
--- a/app/models/pycache/fact_check_models.cpython-312.pyc
+++ b/app/models/pycache/fact_check_models.cpython-312.pyc
--- a/app/models/ai_fact_check_models.py
+++ b/app/models/ai_fact_check_models.py
@ -0,0 +1,229 @@
 from pydantic import BaseModel, Field, HttpUrl, validator, ConfigDict
 from typing import Dict, List, Optional, Any, Union
 from enum import Enum
 from datetime import datetime
 from urllib.parse import urlparse
 # Common Models
 class TokenUsage(BaseModel):
    # Token counters reported with a response. Every field is an optional
    # integer that defaults to 0.
    prompt_tokens: Optional[int] = 0
    completion_tokens: Optional[int] = 0
    total_tokens: Optional[int] = 0
 class ErrorResponse(BaseModel):
    # Error payload: a message, a required error code, a creation timestamp
    # and, optionally, the endpoint path.
    detail: str
    error_code: str = Field(..., description="Unique error code for this type of error")
    # Filled in per instance; datetime.now() has no tzinfo, so this is a
    # naive local-time ISO-8601 string.
    timestamp: str = Field(default_factory=lambda: datetime.now().isoformat())
    path: Optional[str] = Field(None, description="The endpoint path where error occurred")
    # Example shown in the generated JSON schema only.
    model_config = ConfigDict(json_schema_extra={
        "example": {
            "detail": "Error description",
            "error_code": "ERROR_CODE",
            "timestamp": "2024-12-09T16:49:30.905765",
            "path": "/check-facts"
        }
    })
 # Fact Check Models
 class Publisher(BaseModel):
    # Publisher of a claim review: a required name and an optional website.
    name: str
    site: Optional[str] = Field(None, description="Publisher's website")

    @validator('site')
    def validate_site(cls, v):
        """Prefix ``https://`` when a non-empty site has no http(s) scheme."""
        if not v:
            # None / empty string pass through untouched
            return v
        if v.startswith(('http://', 'https://')):
            return v
        return "https://{}".format(v)
 class ClaimReview(BaseModel):
    # One review of a claim. Only `publisher` is required; `url` is validated
    # as an HttpUrl when given, the remaining text fields default to None and
    # `languageCode` defaults to "en-US".
    publisher: Publisher
    url: Optional[HttpUrl] = None
    title: Optional[str] = None
    reviewDate: Optional[str] = None  # kept as a plain string, not parsed into a date
    textualRating: Optional[str] = None
    languageCode: str = Field(default="en-US")
 class Claim(BaseModel):
    # A claim and the reviews attached to it. `text` and `claimReview` are
    # required; `claimant` and `claimDate` are optional strings.
    text: str
    claimant: Optional[str] = None
    claimDate: Optional[str] = None  # kept as a plain string, not parsed into a date
    claimReview: List[ClaimReview]
 class SourceType(str, Enum):
    # String-valued enum (mixes in `str`) with the two source categories.
    FACT_CHECKER = "fact_checker"
    NEWS_SITE = "news_site"
 class FactCheckSource(BaseModel):
    # A configured source site: its domain, its category and a priority
    # constrained to 1..10 (default 1).
    domain: str
    type: SourceType
    priority: int = Field(default=1, ge=1, le=10)
 # Verification Models
 class VerificationResult(BaseModel):
    # Outcome of checking one claim. `verdict` and `confidence` are free-form
    # strings: the descriptions list the expected values, but nothing here
    # enforces them.
    verdict: str = Field(..., description="True/False/Insufficient Information")
    confidence: str = Field(..., description="High/Medium/Low")
    evidence: Union[str, List[str]]  # either one string or a list of strings
    reasoning: str
    missing_info: Optional[str] = None
    # Example shown in the generated JSON schema only.
    model_config = ConfigDict(json_schema_extra={
        "example": {
            "verdict": "True",
            "confidence": "High",
            "evidence": ["Direct quote from source supporting the claim"],
            "reasoning": "Detailed analysis of why the claim is considered true",
            "missing_info": "Any caveats or limitations of the verification"
        }
    })
 # Request Models
 class BaseFactCheckRequest(BaseModel):
    # Shared request payload: the claim text, 10 to 1000 characters long.
    content: str = Field(
        ...,
        min_length=10,
        max_length=1000,
        description="The claim to be fact-checked"
    )

    @validator('content')
    def validate_content(cls, v):
        """Strip surrounding whitespace and re-check the minimum length.

        The ``min_length`` constraint on the field is applied to the raw
        value before this validator runs, so a short claim padded with
        whitespace would otherwise pass and be stored shorter than allowed.

        Raises:
            ValueError: when nothing but whitespace was sent, or fewer than
                10 characters remain after stripping.
        """
        stripped = v.strip()
        if not stripped:
            raise ValueError("Content cannot be empty or just whitespace")
        # Keep in sync with min_length on the field above
        if len(stripped) < 10:
            raise ValueError(
                "Content must be at least 10 characters long, not counting surrounding whitespace"
            )
        return stripped
 class GoogleFactCheckRequest(BaseFactCheckRequest):
    # Adds two options to the base request: a language tag that must look
    # like "xx-XX" (default "en-US") and a per-source result limit
    # constrained to 1..50 (default 10).
    language: str = Field(default="en-US", pattern="^[a-z]{2}-[A-Z]{2}$")
    max_results_per_source: int = Field(default=10, ge=1, le=50)
 class AIFactCheckRequest(BaseFactCheckRequest):
    # Base request plus the 1 to 5 URLs the claim is checked against.
    urls: List[str] = Field(
        ...,
        min_items=1,
        max_items=5,
        description="List of URLs to check the content against. URLs will be prefixed with https:// if protocol is missing"
    )

    @validator('urls')
    def validate_urls(cls, urls):
        """Normalise every URL and reject the ones without a host.

        Each URL is stripped of surrounding whitespace, given an
        ``https://`` prefix when it has no http(s) scheme (the scheme test
        is case-insensitive) and then parsed to make sure it has a host.

        Raises:
            ValueError: for an empty URL, a URL that cannot be parsed, or a
                URL without a network location.
        """
        validated_urls = []
        for raw_url in urls:
            # Strip first: " example.com" would otherwise become
            # "https:// example.com" and still pass the netloc check.
            url = raw_url.strip()
            if not url:
                raise ValueError("URL cannot be empty")
            # Add https:// if no protocol specified
            if not url.lower().startswith(('http://', 'https://')):
                url = f'https://{url}'
            try:
                netloc = urlparse(url).netloc
            except ValueError as e:
                raise ValueError(f"Invalid URL {url}: {str(e)}") from e
            # Checked outside the try so this message is not wrapped a second time
            if not netloc:
                raise ValueError(f"Invalid URL structure for {url}")
            validated_urls.append(url)
        return validated_urls

    # Example shown in the generated JSON schema only.
    model_config = ConfigDict(json_schema_extra={
        "example": {
            "content": "Indian flag was drawn in BUET campus",
            "urls": [
                "www.altnews.in/article-about-flag",
                "www.another-source.com/related-news"
            ]
        }
    })
 # Response Models
 class BaseFactCheckResponse(BaseModel):
    # Fields shared by the response models: the query that was checked, the
    # token usage and the list of sources (plain strings).
    query: str
    token_usage: TokenUsage
    sources: List[str]
    # Example shown in the generated JSON schema only.
    model_config = ConfigDict(json_schema_extra={
        "example": {
            "query": "Example statement to verify",
            "token_usage": {
                "prompt_tokens": 100,
                "completion_tokens": 50,
                "total_tokens": 150
            },
            "sources": ["source1.com", "source2.com"],
        }
    })
 class GoogleFactCheckResponse(BaseFactCheckResponse):
    # Extends the base response (query, token_usage, sources) with a claim
    # count, the claims and the verification result as untyped dictionaries,
    # and a string -> int summary mapping.
    total_claims_found: int
    results: List[Dict[str, Any]]
    verification_result: Dict[str, Any]
    summary: Dict[str, int]
    # Example shown in the generated JSON schema only.
    model_config = ConfigDict(json_schema_extra={
        "example": {
            "query": "Example claim",
            "total_claims_found": 1,
            "results": [{
                "text": "Example claim text",
                "claimant": "Source name",
                "claimReview": [{
                    "publisher": {
                        "name": "Fact Checker",
                        "site": "factchecker.com"
                    },
                    "textualRating": "True"
                }]
            }],
            "verification_result": {
                "verdict": "True",
                "confidence": "High",
                "evidence": ["Supporting evidence"],
                "reasoning": "Detailed analysis"
            },
            "sources": ["factchecker.com"],
            "token_usage": {
                "prompt_tokens": 100,
                "completion_tokens": 50,
                "total_tokens": 150
            },
            "summary": {
                "total_sources": 1,
                "fact_checking_sites_queried": 10
            }
        }
    })
 class AIFactCheckResponse(BaseFactCheckResponse):
    # Extends the base response (query, token_usage, sources) with one
    # VerificationResult per checked URL, keyed by that URL.
    verification_result: Dict[str, VerificationResult]  # Changed to Dict to store results per URL
    # Example shown in the generated JSON schema only.
    model_config = ConfigDict(json_schema_extra={
        "example": {
            "query": "Indian flag was drawn in BUET campus",
            "verification_result": {
                "https://www.source1.com": {
                    "verdict": "True",
                    "confidence": "High",
                    "evidence": ["Supporting evidence from source 1"],
                    "reasoning": "Detailed analysis from source 1",
                    "missing_info": None
                },
                "https://www.source2.com": {
                    "verdict": "True",
                    "confidence": "Medium",
                    "evidence": ["Supporting evidence from source 2"],
                    "reasoning": "Analysis from source 2",
                    "missing_info": "Additional context needed"
                }
            },
            "sources": ["source1.com", "source2.com"],
            "token_usage": {
                "prompt_tokens": 200,
                "completion_tokens": 100,
                "total_tokens": 300
            }
        }
    })
 # Backwards compatibility aliases: the old names point at the Google variants.
 # NOTE(review): app/models/fact_check_models.py declares different models
 # under these same two names — check which module a caller imports from.
 FactCheckRequest = GoogleFactCheckRequest
 FactCheckResponse = GoogleFactCheckResponse
--- a/app/models/fact_check_models.py
+++ b/app/models/fact_check_models.py
@ -0,0 +1,101 @@
 from pydantic import BaseModel, Field, HttpUrl, validator
 from typing import List, Literal, Union
 from datetime import datetime
 from enum import Enum
 class VerdictEnum(str, Enum):
    """Closed set of verdicts a fact-check can return.

    The ``str`` mixin makes every member compare equal to its string value.
    """
    TRUE = "True"
    FALSE = "False"
    PARTIALLY_TRUE = "Partially True"
    UNVERIFIED = "Unverified"
 class ConfidenceEnum(str, Enum):
    """Closed set of confidence levels attached to a verdict.

    The ``str`` mixin makes every member compare equal to its string value.
    """
    HIGH = "High"
    MEDIUM = "Medium"
    LOW = "Low"
 class FactCheckRequest(BaseModel):
    """Request body carrying the claim to be fact-checked."""
    # Required; must be between 3 and 500 characters long.
    query: str = Field(
        ...,
        min_length=3,
        max_length=500,
        description="The claim or statement to be fact-checked",
        example="Did NASA confirm finding alien structures on Mars in 2024?"
    )
 class Source(BaseModel):
    """A source cited by a fact-check: its URL plus an optional display name."""
    url: str
    name: str = ""

    @validator('url')
    def validate_url(cls, v):
        """Accept any URL of at least three characters and return it unchanged.

        No scheme (http/https) is required.

        Raises:
            ValueError: If the URL is empty or shorter than three characters.
        """
        # Positive guard: anything long enough passes through untouched.
        if v and len(v) >= 3:
            return v
        raise ValueError("URL must not be empty and must be at least 3 characters")
 class FactCheckResponse(BaseModel):
    """Structured result of verifying a single claim.

    Every field is required. The length bounds declared on each field are
    enforced by pydantic when the model is validated.
    """
    # The claim text, 10-1000 characters.
    claim: str = Field(
        ...,
        min_length=10,
        max_length=1000,
        description="The exact claim being verified"
    )
    # One of the VerdictEnum values.
    verdict: VerdictEnum = Field(
        ...,
        description="The verification verdict"
    )
    # One of the ConfidenceEnum values.
    confidence: ConfidenceEnum = Field(
        ...,
        description="Confidence level in the verdict"
    )
    # At least one Source must be supplied.
    sources: List[Source] = Field(
        ...,
        min_items=1,
        description="List of sources used in verification"
    )
    # Short evidence summary, 20-500 characters.
    evidence: str = Field(
        ...,
        min_length=20,
        max_length=500,
        description="Concise summary of key evidence"
    )
    # Longer explanation, 50-1000 characters.
    explanation: str = Field(
        ...,
        min_length=50,
        max_length=1000,
        description="Detailed explanation of verification findings"
    )
    # Surrounding context, 20-500 characters.
    additional_context: str = Field(
        ...,
        min_length=20,
        max_length=500,
        description="Important context about the verification"
    )
    class Config:
        # Example payload attached to the generated JSON schema.
        json_schema_extra = {
            "example": {
                "claim": "NASA confirmed finding alien structures on Mars in 2024",
                "verdict": "False",
                "confidence": "High",
                "sources": [
                    {
                        "url": "https://www.nasa.gov/mars-exploration",
                        "name": "NASA Mars Exploration"
                    },
                    {
                        "url": "https://factcheck.org/2024/mars-claims",
                        "name": "FactCheck.org"
                    }
                ],
                "evidence": "NASA has made no such announcement. Recent Mars rover images show natural rock formations.",
                "explanation": "Multiple fact-checking organizations investigated this claim. NASA's official communications and Mars mission reports from 2024 contain no mention of alien structures. The viral images being shared are misidentified natural geological formations.",
                "additional_context": "Similar false claims about alien structures on Mars have circulated periodically since the first Mars rovers began sending back images."
            }
        }
 class ErrorResponse(BaseModel):
    """Error payload returned when a request fails."""
    # Human-readable description of the failure.
    detail: str
    # Machine-readable identifier, e.g. "VALIDATION_ERROR".
    error_code: str = Field(..., example="VALIDATION_ERROR")
    # Request path the error relates to, e.g. "/check-facts".
    path: str = Field(..., example="/check-facts")
--- a/app/models/scrap_websites_models.py
+++ b/app/models/scrap_websites_models.py
@ -0,0 +1,43 @@
 from pydantic import BaseModel
 from typing import List, Dict
 class SearchRequest(BaseModel):
    """Search request: the text to look up and the source groups to search."""
    # Free-text query.
    search_text: str
    # Defaults to the "fact_checkers" group only.
    # NOTE(review): presumably these are keys of the SOURCES configuration - confirm.
    source_types: List[str] = ["fact_checkers"]
 class Publisher(BaseModel):
    """Publisher of a claim review, identified by name and site."""
    name: str
    site: str
 class ClaimReview(BaseModel):
    """A single review of a claim: who published it and the rating given."""
    publisher: Publisher
    # NOTE(review): camelCase presumably mirrors the upstream fact-check API payload - confirm.
    textualRating: str
 class Claim(BaseModel):
    """A claim together with the reviews that were found for it."""
    # NOTE(review): camelCase presumably mirrors the upstream fact-check API payload - confirm.
    claimReview: List[ClaimReview]
    claimant: str
    text: str
 class Summary(BaseModel):
    """Aggregate counters describing one fact-check search run."""
    fact_checking_sites_queried: int
    total_sources: int
 class TokenUsage(BaseModel):
    """Token counters for a model call: prompt, completion and total."""
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
 class VerificationResult(BaseModel):
    """Outcome of verifying a statement: verdict, confidence and rationale."""
    # Free-form strings here; no enum is enforced by this model.
    verdict: str
    confidence: str
    # Evidence is a list of strings, one entry per item.
    evidence: List[str]
    reasoning: str
 class EnhancedFactCheckResponse(BaseModel):
    """Combined response: matched claims, sources, summary and verification."""
    query: str
    results: List[Claim]
    sources: List[str]
    summary: Summary
    # Plain name -> count mapping; the TokenUsage model in this module is not used here.
    token_usage: Dict[str, int]
    total_claims_found: int
    verification_result: VerificationResult
--- a/app/services/openai_client.py
+++ b/app/services/openai_client.py
@ -0,0 +1,172 @@
 from langchain_community.document_loaders import AsyncHtmlLoader
 from langchain_community.document_transformers import BeautifulSoupTransformer
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_core.documents import Document
 from typing import List, Dict, Any
 import numpy as np
 import logging as logger
 import openai
 import json
 class OpenAIClient:
    """Thin wrapper around the ``openai`` SDK for chat completions and embeddings."""

    def __init__(self, api_key: str):
        """
        Initialize OpenAI client with the provided API key.

        The key is stored on the ``openai`` module itself, so it is shared by
        every client instance in the process.
        """
        openai.api_key = api_key

    async def generate_text_response(self, system_prompt: str, user_prompt: str, max_tokens: int) -> dict:
        """
        Generate a response using OpenAI's chat completion API.

        Args:
            system_prompt: Instructions sent as the system message.
            user_prompt: Content sent as the user message.
            max_tokens: Upper bound on the completion length.

        Returns:
            A dict with the parsed JSON reply under ``"response"`` plus the
            ``"prompt_tokens"``, ``"completion_tokens"`` and ``"total_tokens"``
            counters reported by the API.

        Raises:
            Exception: If the API call fails or the reply is not valid JSON.
                The original error is chained as ``__cause__``.
        """
        try:
            # Await the SDK's asynchronous variant: the synchronous ``create``
            # would block the event loop and serialise concurrent fact checks.
            response = await openai.ChatCompletion.acreate(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                max_tokens=max_tokens
            )
            content = response['choices'][0]['message']['content']
            # The prompt asks the model for JSON, so hand callers a dictionary
            # instead of the raw string.
            parsed_content = json.loads(content)
            usage = response['usage']
            return {
                "response": parsed_content,
                "prompt_tokens": usage['prompt_tokens'],
                "completion_tokens": usage['completion_tokens'],
                "total_tokens": usage['total_tokens']
            }
        except json.JSONDecodeError as e:
            raise Exception(f"Failed to parse OpenAI response as JSON: {str(e)}") from e
        except Exception as e:
            raise Exception(f"OpenAI text generation error: {str(e)}") from e

    def get_embeddings(self, texts: List[str]) -> List[List[float]]:
        """
        Retrieve embeddings for a list of texts using OpenAI's embedding API.

        Args:
            texts: Strings to embed.

        Returns:
            One embedding per input text, in the order the API returns them.
            An empty input yields an empty list without calling the API.

        Raises:
            Exception: If the API call fails; the original error is chained.
        """
        # Nothing to embed: skip a request that could only fail.
        if not texts:
            return []
        try:
            response = openai.Embedding.create(
                input=texts,
                model="text-embedding-ada-002"
            )
            return [data['embedding'] for data in response['data']]
        except Exception as e:
            raise Exception(f"OpenAI embedding error: {str(e)}") from e
 class AIFactChecker:
    """Fact-check a statement against a web page.

    The page is scraped and split into chunks, the chunks closest to the
    statement are selected by embedding similarity, and a chat model is asked
    for a verdict based on those chunks.
    """

    def __init__(self, openai_client: OpenAIClient):
        """Initialize the fact checker with OpenAI client."""
        self.openai_client = openai_client
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        )

    async def scrape_webpage(self, url: str) -> List[Document]:
        """Scrape webpage content using LangChain's AsyncHtmlLoader.

        Args:
            url: Address of the page to load.

        Returns:
            The page content after the BeautifulSoup transform, split into
            chunks by ``self.text_splitter``.

        Raises:
            Exception: Any loader, transformer or splitter error is logged
                and re-raised unchanged.
        """
        try:
            loader = AsyncHtmlLoader([url])
            docs = await loader.aload()
            bs_transformer = BeautifulSoupTransformer()
            docs_transformed = bs_transformer.transform_documents(docs)
            docs_chunks = self.text_splitter.split_documents(docs_transformed)
            logger.info(f"Successfully scraped webpage | chunks={len(docs_chunks)}")
            return docs_chunks
        except Exception as e:
            logger.error(f"Error scraping webpage | url={url} | error={str(e)}")
            raise

    def find_relevant_chunks(
        self,
        query_embedding: List[float],
        doc_embeddings: List[List[float]],
        docs: List[Document],
        top_k: int = 5
    ) -> List[Document]:
        """Find most relevant document chunks using cosine similarity.

        Args:
            query_embedding: Embedding of the statement being checked.
            doc_embeddings: One embedding per entry of ``docs``, same order.
            docs: Candidate document chunks.
            top_k: Maximum number of chunks to return (default 5).

        Returns:
            Up to ``top_k`` chunks, most similar first. Empty when ``docs`` is
            empty or ``top_k`` is not positive.

        Raises:
            Exception: Any numpy error is logged and re-raised unchanged.
        """
        try:
            if not docs or top_k <= 0:
                return []
            query_array = np.array(query_embedding)
            chunks_array = np.array(doc_embeddings)
            norms = np.linalg.norm(chunks_array, axis=1) * np.linalg.norm(query_array)
            # A zero-length vector has a zero dot product as well; dividing by
            # 1 scores it 0 instead of NaN, which argsort would rank highest.
            norms = np.where(norms == 0, 1.0, norms)
            similarities = np.dot(chunks_array, query_array) / norms
            top_indices = np.argsort(similarities)[-top_k:][::-1]
            return [docs[i] for i in top_indices]
        except Exception as e:
            logger.error(f"Error finding relevant chunks | error={str(e)}")
            raise

    async def verify_fact(self, query: str, relevant_docs: List[Document]) -> Dict[str, Any]:
        """Verify fact using OpenAI's API with context from relevant documents.

        Args:
            query: Statement to verify.
            relevant_docs: Chunks whose text is supplied to the model as context.

        Returns:
            A dict with ``"verification_result"`` (the model's parsed JSON
            reply), ``"sources"`` (distinct ``source`` metadata values of the
            chunks) and ``"token_usage"``.

        Raises:
            Exception: Any error from the OpenAI client is logged and re-raised.
        """
        try:
            context = "\n\n".join([doc.page_content for doc in relevant_docs])
            # NOTE(review): the prompt requests "evidence" as a single string,
            # while the response examples in the models show a list - confirm
            # that the consuming model accepts what the LLM returns.
            system_prompt = """You are a professional fact-checking assistant. Analyze the provided context 
            and determine if the given statement is true, false, or if there isn't enough information.
            Provide your response in the following JSON format:
            {
                "verdict": "True/False/Insufficient Information",
                "confidence": "High/Medium/Low",
                "evidence": "Direct quotes or evidence from the context",
                "reasoning": "Your detailed analysis and reasoning",
                "missing_info": "Any important missing information (if applicable)"
            }"""
            user_prompt = f"""Context:
            {context}
            Statement to verify: "{query}"
            Analyze the statement based on the provided context and return your response in the specified JSON format."""
            response = await self.openai_client.generate_text_response(
                system_prompt=system_prompt,
                user_prompt=user_prompt,
                max_tokens=800
            )
            # De-duplicate: several chunks usually come from the same page.
            sources = list({doc.metadata.get('source', 'Unknown source') for doc in relevant_docs})
            return {
                "verification_result": response["response"],  # Parsed dictionary, not a raw string
                "sources": sources,
                "token_usage": {
                    "prompt_tokens": response["prompt_tokens"],
                    "completion_tokens": response["completion_tokens"],
                    "total_tokens": response["total_tokens"]
                }
            }
        except Exception as e:
            logger.error(f"Error verifying fact | error={str(e)}")
            raise

    async def check_fact(self, url: str, query: str) -> Dict[str, Any]:
        """Main method to check a fact against a webpage.

        Args:
            url: Page to use as evidence.
            query: Statement to verify.

        Returns:
            The dict produced by ``verify_fact``.

        Raises:
            ValueError: If no content could be extracted from the page.
            Exception: Any scraping, embedding or verification error is
                logged and re-raised.
        """
        # Local import keeps this block self-contained.
        import asyncio

        try:
            docs = await self.scrape_webpage(url)
            if not docs:
                # Fail with a clear message instead of sending an empty
                # batch to the embedding API.
                raise ValueError(f"No content could be extracted from {url}")
            doc_texts = [doc.page_content for doc in docs]
            # get_embeddings is a blocking call: run both requests in worker
            # threads so the event loop stays free, and run them concurrently.
            doc_embeddings, query_embedding = await asyncio.gather(
                asyncio.to_thread(self.openai_client.get_embeddings, doc_texts),
                asyncio.to_thread(self.openai_client.get_embeddings, [query]),
            )
            relevant_docs = self.find_relevant_chunks(query_embedding[0], doc_embeddings, docs)
            verification_result = await self.verify_fact(query, relevant_docs)
            return verification_result
        except Exception as e:
            logger.error(f"Error checking fact | error={str(e)}")
            raise
--- a/app/websites/pycache/fact_checker_website.cpython-312.pyc
+++ b/app/websites/pycache/fact_checker_website.cpython-312.pyc
--- a/app/websites/fact_checker_website.py
+++ b/app/websites/fact_checker_website.py
@ -0,0 +1,190 @@
 from typing import Dict, List
 import requests
 from fastapi import HTTPException
 from app.models.ai_fact_check_models import FactCheckSource, ErrorResponse, FactCheckRequest, SourceType
 # Sources configuration with validation.
 # Maps a source group name to its FactCheckSource entries. Each domain is
 # listed once per group so that no site appears twice in the results of
 # get_all_sources().
 SOURCES = {
    "fact_checkers": [
        FactCheckSource(domain=domain, type=SourceType.FACT_CHECKER, priority=1)
        for domain in [
            "snopes.com",
            "politifact.com",
            "factcheck.org",
            "reuters.com/fact-check",
            "apnews.com/hub/ap-fact-check",
            "bbc.com/news/reality_check",
            "fullfact.org",
            "afp.com/fact-check",
            "truthorfiction.com",
            "leadstories.com",
            "checkyourfact.com",
            "washingtonpost.com/news/fact-checker",
            "factcheck.kz",
            "poynter.org/ifcn",
            "factcheckeu.info",
            "africacheck.org",
            "thequint.com/webqoof",
            "altnews.in",
            "facta.news",
            "factcheckni.org",
            "mythdetector.ge",
            "verificado.mx",
            "euvsdisinfo.eu",
            "factcheck.afp.com",
            "newtral.es",
            "maldita.es",
            "faktograf.hr",
            "demagog.org.pl",
            "factnameh.com",
            "faktiskt.se",
            "teyit.org",
            "factly.in",
            "boom.live",
            "stopfake.org",
            "factcheck.ge",
            "factcheck.kg",
            "factcheck.uz",
            "factcheck.tj",
            "factcheck.az",
            "factcheck.am",
            "factcheck.md",
            "verafiles.org",
            "rappler.com/fact-check",
            "vera.com.gt",
            "chequeado.com",
            "aosfatos.org",
            "lasillavacia.com/detector-mentiras",
            "colombiacheck.com",
            "ecuadorchequea.com",
            "elsurti.com/checado",
            "verificat.cat",
            "mafindo.or.id",
            "tempo.co/cek-fakta",
            "factcheck.mk",
            "raskrinkavanje.ba",
            "demagog.cz",
            "faktabaari.fi",
            "correctiv.org",
            "mimikama.at",
            "factcheck.vlaanderen",
            "factuel.afp.com",
            "nieuwscheckers.nl",
            "faktisk.no",
            "tjekdet.dk",
            "ellinikahoaxes.gr",
            "faktograf.id",
            "stopfake.kz",
            "pesacheck.org",
            "dubawa.org",
            "namibiafactcheck.org.na",
            "zimfact.org",
            "ghanafact.com",
            "factspace.africa",
            "factcrescendo.com",
            "vishvasnews.com",
            "factcheck.lk",
            "newschecker.in",
            "boomlive.in",
            "digiteye.in",
            "indiatoday.in/fact-check",
            "piyasa.com/fact-check",
            "taiwanese.facts.news",
            "taiwanfactcheck.com",
            "mygopen.com",
            "tfc-taiwan.org.tw",
            "cofacts.tw",
            "rumor.taipei",
            "fact.qq.com",
            "factcheck.afp.com/list",
            "acfta.org",
            "crosscheck.firstdraftnews.org",
            "healthfeedback.org",
            "climatefeedback.org",
            "sciencefeedback.co",
            "factcheck.aap.com.au",
            "emergent.info",
            "hoax-slayer.net",
            "factcheck.media",
            "mediawise.org",
            "thejournal.ie/factcheck",
            "journalistsresource.org",
            "metafact.io",
            "reporterslab.org/fact-checking"
        ]
    ],
    "news_sites": [
        FactCheckSource(domain=domain, type=SourceType.NEWS_SITE, priority=2)
        for domain in [
            "www.thedailystar.net",
            "www.thefinancialexpress.com.bd",
            "www.theindependentbd.com",
            "www.dhakatribune.com",
            "www.newagebd.net",
            "www.observerbd.com",
            "www.daily-sun.com",
            "www.tbsnews.net",
            "www.businesspostbd.com",
            "www.banglanews24.com/english",
            "www.bdnews24.com/english",
            "www.risingbd.com/english",
            "www.dailyindustry.news",
            "www.bangladeshpost.net",
            "www.daily-bangladesh.com/english"
        ]
    ]
 }
 async def fetch_fact_checks(
    api_key: str,
    base_url: str,
    query: str,
    site: FactCheckSource,
    timeout: float = 10.0
 ) -> Dict:
    """
    Fetch fact checks from a specific site using the Google Fact Check API

    Args:
        api_key: API key sent as the ``key`` query parameter.
        base_url: Endpoint to send the GET request to.
        query: Text to search fact checks for.
        site: Source whose ``domain`` is used as the publisher site filter.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        HTTPException: 503 when the HTTP request fails or times out, 500 when
            the API key or base URL is missing.
    """
    # Local import keeps this block self-contained.
    import asyncio

    try:
        if not api_key or not base_url:
            raise ValueError("API key or base URL not configured")
        params = {
            "key": api_key,
            "query": query,
            "languageCode": "en-US",
            "reviewPublisherSiteFilter": site.domain,
            "pageSize": 10
        }
        # requests is blocking: run it in a worker thread so the event loop
        # stays responsive, and bound it with a timeout so an unresponsive
        # server cannot hang the request indefinitely.
        response = await asyncio.to_thread(
            requests.get, base_url, params=params, timeout=timeout
        )
        response.raise_for_status()
        return response.json()
    except requests.RequestException as e:
        raise HTTPException(
            status_code=503,
            detail=ErrorResponse(
                detail=f"Error fetching from {site.domain}: {str(e)}",
                error_code="FACT_CHECK_SERVICE_ERROR",
                path="/check-facts"
            ).dict()
        ) from e
    except ValueError as e:
        raise HTTPException(
            status_code=500,
            detail=ErrorResponse(
                detail=str(e),
                error_code="CONFIGURATION_ERROR",
                path="/check-facts"
            ).dict()
        ) from e
 def get_all_sources() -> List[FactCheckSource]:
    """
    Return the configured fact-checker sources ordered by ascending priority.

    Only the "fact_checkers" group is used; "news_sites" is left out.
    """
    # To include news sites as well:
    # SOURCES["fact_checkers"] + SOURCES["news_sites"]
    fact_checkers = SOURCES["fact_checkers"]

    def by_priority(source):
        return source.priority

    return sorted(fact_checkers, key=by_priority)
--- a/main.py
+++ b/main.py
@ -1,6 +1,8 @@
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from app.api.fact_check import fact_check_router
 from app.api.ai_fact_check import aifact_check_router
 from app.api.scrap_websites import scrap_websites_router
 from app.config import FRONTEND_URL
 # Initialize FastAPI app
@ -39,6 +41,8 @@ async def health_check():
    return {"status": "healthy"}
 app.include_router(fact_check_router, prefix="")
 app.include_router(aifact_check_router, prefix="")
 app.include_router(scrap_websites_router, prefix="")
 # Include routers (uncomment and modify as needed)
 # from routes import some_router
--- a/search_response_altnews_in.html
+++ b/search_response_altnews_in.html
--- a/search_response_bbc_com.html
+++ b/search_response_bbc_com.html
--- a/search_response_en_prothomalo_com.html
+++ b/search_response_en_prothomalo_com.html