fact-checker-backend/app/api/fact_check.py
2024-12-09 17:18:09 +06:00

291 lines
No EOL
8.7 KiB
Python

import asyncio
import json
import logging
from datetime import datetime
from enum import Enum
from functools import partial
from typing import Dict, List, Optional, Union

import requests
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel, Field, HttpUrl, validator, ConfigDict

from app.config import GOOGLE_FACT_CHECK_API_KEY, GOOGLE_FACT_CHECK_BASE_URL
# Router on which this module registers its endpoints (see the
# ``@fact_check_router.post`` decorator further down).
fact_check_router = APIRouter()
class CustomJSONEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``datetime`` objects as ISO-8601 strings."""

    def default(self, obj):
        """Return ``obj.isoformat()`` for datetimes; defer to the base class otherwise.

        The base implementation raises ``TypeError`` for unsupported types.
        """
        if not isinstance(obj, datetime):
            return super().default(obj)
        return obj.isoformat()
# Structured error payload; this module serializes it into the ``detail`` of
# the HTTPExceptions it raises.
# (Documented with comments rather than a docstring so the generated JSON
# schema is unchanged.)
class ErrorResponse(BaseModel):
    # Human-readable description of the failure.
    detail: str
    error_code: str = Field(
        default=...,
        description="Unique error code for this type of error",
    )
    # Filled in at construction time from the naive local clock.
    timestamp: str = Field(default_factory=lambda: datetime.now().isoformat())
    path: Optional[str] = Field(
        default=None,
        description="The endpoint path where error occurred",
    )

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "detail": "Error description",
                "error_code": "ERROR_CODE",
                "timestamp": "2024-12-09T16:49:30.905765",
                "path": "/check-facts",
            },
        },
    )
# Shape of a single validation error entry: where it occurred, what went
# wrong, and the error's type identifier.
class RequestValidationError(BaseModel):
    loc: List[str]  # path segments locating the offending value
    msg: str  # human-readable message
    type: str  # error type identifier
# Publisher of a claim review. ``site`` is normalized so that a non-empty
# value always carries an explicit URL scheme.
class Publisher(BaseModel):
    name: str
    site: Optional[str] = Field(default=None, description="Publisher's website")

    @validator('site')
    def validate_site(cls, v):
        # Missing or empty sites are passed through untouched.
        if not v:
            return v
        # Values that already carry a scheme are kept as they are.
        if v.startswith(('http://', 'https://')):
            return v
        return f"https://{v}"
# One review of a claim by a publisher. Only ``publisher`` is required;
# the remaining fields default to ``None`` except ``languageCode``.
class ClaimReview(BaseModel):
    publisher: Publisher
    url: Optional[HttpUrl] = None
    title: Optional[str] = None
    reviewDate: Optional[str] = None
    textualRating: Optional[str] = None
    languageCode: str = Field(default="en-US")  # used when the payload omits it
# A claim together with the reviews published about it. ``text`` and
# ``claimReview`` are required; ``claimant`` and ``claimDate`` are optional.
class Claim(BaseModel):
    text: str
    claimant: Optional[str] = None
    claimDate: Optional[str] = None
    claimReview: List[ClaimReview]
# Response body of the fact-check endpoint: the original query, the number
# of claims found, the claims themselves, and a dictionary of integer
# summary counters.
class FactCheckResponse(BaseModel):
    query: str = Field(
        default=...,
        description="Original query that was fact-checked",
    )
    total_claims_found: int = Field(default=..., ge=0)
    results: List[Claim] = Field(default_factory=list)
    summary: Dict[str, int] = Field(default=...)

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "query": "Example claim",
                "total_claims_found": 1,
                "results": [
                    {
                        "text": "Example claim text",
                        "claimant": "Source name",
                        "claimReview": [
                            {
                                "publisher": {
                                    "name": "Fact Checker",
                                    "site": "factchecker.com",
                                },
                                "textualRating": "True",
                            },
                        ],
                    },
                ],
                "summary": {
                    "total_sources": 1,
                    "fact_checking_sites_queried": 10,
                },
            },
        },
    )
# Category of a configured source. Members are also ``str`` instances, so
# they compare equal to their string values.
class SourceType(str, Enum):
    FACT_CHECKER = "fact_checker"
    NEWS_SITE = "news_site"
# A site that can be queried for fact checks: its domain, its category and
# a priority constrained to the range 1..10 (defaults to 1).
class FactCheckSource(BaseModel):
    domain: str
    type: SourceType
    priority: int = Field(default=1, ge=1, le=10)

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "domain": "factcheck.org",
                "type": "fact_checker",
                "priority": 1,
            },
        },
    )
# Domains registered as dedicated fact-checking sources (priority 1).
_FACT_CHECKER_DOMAINS = (
    "factcheck.org",
    "snopes.com",
    "politifact.com",
    "reuters.com",
    "bbc.com",
    "apnews.com",
    "usatoday.com",
    "nytimes.com",
    "washingtonpost.com",
    "afp.com",
    "fullfact.org",
    "truthorfiction.com",
    "leadstories.com",
    "altnews.in",
    "boomlive.in",
    "en.prothomalo.com",
)

# Domains registered as general news sources (priority 2).
_NEWS_SITE_DOMAINS = (
    "www.thedailystar.net",
    "www.thefinancialexpress.com.bd",
    "www.theindependentbd.com",
    "www.dhakatribune.com",
    "www.newagebd.net",
    "www.observerbd.com",
    "www.daily-sun.com",
    "www.tbsnews.net",
    "www.businesspostbd.com",
    "www.banglanews24.com/english",
    "www.bdnews24.com/english",
    "www.risingbd.com/english",
    "www.dailyindustry.news",
    "www.bangladeshpost.net",
    "www.daily-bangladesh.com/english",
)

# Sources configuration with validation: every entry is built through the
# FactCheckSource model, so an invalid entry fails at import time.
SOURCES = {
    "fact_checkers": [
        FactCheckSource(domain=name, type=SourceType.FACT_CHECKER, priority=1)
        for name in _FACT_CHECKER_DOMAINS
    ],
    "news_sites": [
        FactCheckSource(domain=name, type=SourceType.NEWS_SITE, priority=2)
        for name in _NEWS_SITE_DOMAINS
    ],
}
# Minimum number of characters a claim must contain. Enforced both on the raw
# value (by the field constraint) and on the whitespace-trimmed value (by the
# validator below).
_MIN_CONTENT_LENGTH = 10


# Request body of the fact-check endpoint.
# (Documented with comments rather than a docstring so the generated JSON
# schema is unchanged.)
class FactCheckRequest(BaseModel):
    content: str = Field(
        ...,
        min_length=_MIN_CONTENT_LENGTH,
        max_length=1000,
        description="The claim to be fact-checked"
    )
    # Language tag of the form "xx-YY", e.g. "en-US".
    language: str = Field(default="en-US", pattern="^[a-z]{2}-[A-Z]{2}$")
    max_results_per_source: int = Field(default=10, ge=1, le=50)

    @validator('content')
    def validate_content(cls, v):
        """Trim surrounding whitespace and reject claims that are too short.

        The ``min_length`` field constraint is evaluated on the raw string,
        so whitespace padding could otherwise smuggle a shorter claim past
        it. The length is therefore checked again after trimming.

        Raises:
            ValueError: if the trimmed content is empty or shorter than the
                minimum length.
        """
        stripped = v.strip()
        if not stripped:
            raise ValueError("Content cannot be empty or just whitespace")
        if len(stripped) < _MIN_CONTENT_LENGTH:
            raise ValueError(
                f"Content must be at least {_MIN_CONTENT_LENGTH} characters "
                "long after trimming whitespace"
            )
        return stripped
_logger = logging.getLogger(__name__)

# Upper bound, in seconds, on a single upstream request. Without a timeout
# ``requests`` may wait indefinitely on an unresponsive server.
_REQUEST_TIMEOUT_SECONDS = 10.0


def _error_exception(status_code: int, detail: str, error_code: str) -> HTTPException:
    """Build an HTTPException whose detail is a serialized ErrorResponse."""
    payload = ErrorResponse(
        detail=detail,
        error_code=error_code,
        path="/check-facts"
    )
    return HTTPException(status_code=status_code, detail=payload.model_dump())


def _describe_request_error(exc: Exception) -> str:
    """Summarize a ``requests`` failure without echoing the request URL.

    ``str(exc)`` for a requests exception usually contains the full URL,
    including the ``key`` query parameter, so it must not reach the client.
    """
    response = getattr(exc, "response", None)
    # Compare with None explicitly: a Response is falsy for 4xx/5xx statuses.
    if response is not None:
        return f"HTTP {response.status_code}"
    return type(exc).__name__


def _parse_claims(payload, source_domain: str) -> List[Claim]:
    """Validate the ``claims`` of an API payload, skipping malformed entries."""
    raw_claims = payload.get("claims") if isinstance(payload, dict) else None
    if not isinstance(raw_claims, list):
        return []
    claims = []
    for raw_claim in raw_claims:
        try:
            claims.append(Claim(**raw_claim))
        except (TypeError, ValueError) as exc:
            # Pydantic validation errors are ValueErrors; TypeError covers
            # entries that are not mappings. One bad claim must not discard
            # the valid claims returned next to it.
            _logger.warning(
                "Skipping malformed claim from %s: %s", source_domain, exc
            )
    return claims


async def fetch_fact_checks(
    api_key: str,
    base_url: str,
    query: str,
    site: FactCheckSource,
    *,
    language_code: str = "en-US",
    page_size: int = 10,
    timeout: float = _REQUEST_TIMEOUT_SECONDS
) -> Dict:
    """
    Fetch fact checks from a specific site using the Google Fact Check API

    Args:
        api_key: API key sent as the ``key`` query parameter.
        base_url: Endpoint URL to query.
        query: Text of the claim to search for.
        site: Source whose domain is used as ``reviewPublisherSiteFilter``.
        language_code: Value sent as ``languageCode``.
        page_size: Value sent as ``pageSize``.
        timeout: Seconds to wait for the upstream response.

    Returns:
        The decoded JSON body of the response.

    Raises:
        HTTPException: 500 (CONFIGURATION_ERROR) when the key or URL is
            empty; 503 (FACT_CHECK_SERVICE_ERROR) when the request fails or
            the body is not valid JSON.
    """
    if not api_key or not base_url:
        raise _error_exception(
            500, "API key or base URL not configured", "CONFIGURATION_ERROR"
        )

    params = {
        "key": api_key,
        "query": query,
        "languageCode": language_code,
        "reviewPublisherSiteFilter": site.domain,
        "pageSize": page_size
    }
    loop = asyncio.get_running_loop()
    try:
        # ``requests`` is blocking; run it in the default executor so the
        # event loop keeps serving other requests while we wait.
        response = await loop.run_in_executor(
            None,
            partial(requests.get, base_url, params=params, timeout=timeout)
        )
        response.raise_for_status()
        return response.json()
    except requests.RequestException as exc:
        reason = _describe_request_error(exc)
        _logger.warning("Fact check request for %s failed: %s", site.domain, reason)
        raise _error_exception(
            503,
            f"Error fetching from {site.domain}: {reason}",
            "FACT_CHECK_SERVICE_ERROR"
        ) from exc
    except ValueError as exc:
        # Some requests versions raise a plain ValueError for a non-JSON
        # body; that is an upstream failure, not a configuration problem.
        _logger.warning("Fact check response for %s was not valid JSON", site.domain)
        raise _error_exception(
            503,
            f"Error fetching from {site.domain}: invalid JSON response",
            "FACT_CHECK_SERVICE_ERROR"
        ) from exc


# Endpoint behaviour:
#   * queries every configured source in ascending priority order, using the
#     request's language and max_results_per_source;
#   * an HTTPException raised while fetching a source aborts the request;
#     any other unexpected error is logged and that source is skipped;
#   * responds 404 (NO_RESULTS_FOUND) when no source yields a valid claim.
# The docstring is kept short on purpose: FastAPI publishes it as the
# operation description in the OpenAPI schema.
@fact_check_router.post(
    "/check-facts",
    response_model=FactCheckResponse,
    responses={
        400: {"model": ErrorResponse},
        404: {"model": ErrorResponse},
        500: {"model": ErrorResponse},
        503: {"model": ErrorResponse}
    }
)
async def check_facts(request: FactCheckRequest) -> FactCheckResponse:
    """
    Check facts using multiple fact-checking sources
    """
    # Validate configuration
    if not GOOGLE_FACT_CHECK_API_KEY or not GOOGLE_FACT_CHECK_BASE_URL:
        raise _error_exception(
            500, "API configuration is missing", "CONFIGURATION_ERROR"
        )

    # Check all sources in priority order. ``sorted`` builds a new list, so
    # the shared SOURCES configuration is never reordered.
    all_sources = sorted(
        SOURCES["fact_checkers"] + SOURCES["news_sites"],
        key=lambda source: source.priority
    )

    all_results: List[Claim] = []
    for source in all_sources:
        try:
            result = await fetch_fact_checks(
                GOOGLE_FACT_CHECK_API_KEY,
                GOOGLE_FACT_CHECK_BASE_URL,
                request.content,
                source,
                language_code=request.language,
                page_size=request.max_results_per_source
            )
        except HTTPException:
            raise
        except Exception:
            # Best effort: log the unexpected error and move on to the
            # remaining sources.
            _logger.exception("Error processing %s", source.domain)
            continue
        all_results.extend(_parse_claims(result, source.domain))

    if not all_results:
        raise _error_exception(404, "No fact check results found", "NO_RESULTS_FOUND")

    # Distinct publishers across every review of every claim. The publisher
    # name stands in when no site is given, so a missing site is never
    # counted as a source of its own.
    publishers = {
        review.publisher.site or review.publisher.name
        for claim in all_results
        for review in claim.claimReview
    }

    return FactCheckResponse(
        query=request.content,
        total_claims_found=len(all_results),
        results=all_results,
        summary={
            "total_sources": len(publishers),
            "fact_checking_sites_queried": len(all_sources)
        }
    )