diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 676a881..2f4fa69 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -11,7 +11,6 @@ cache: stages: - setup - - lint - test before_script: @@ -29,14 +28,6 @@ setup: - venv/ expire_in: 1 hour -lint: - stage: lint - needs: - - setup - script: - - black --check app/ main.py tests/ - - flake8 app/ main.py tests/ --max-line-length=100 - test: stage: test needs: @@ -47,7 +38,7 @@ test: # Start FastAPI server - uvicorn main:app --host 0.0.0.0 --port 8000 & # Wait for server to start - - sleep 10 + - sleep 15 # Test health endpoint - | RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8000/health) diff --git a/app/__pycache__/config.cpython-312.pyc b/app/__pycache__/config.cpython-312.pyc index b086fe1..74c6db2 100644 Binary files a/app/__pycache__/config.cpython-312.pyc and b/app/__pycache__/config.cpython-312.pyc differ diff --git a/app/api/__pycache__/fact_check.cpython-312.pyc b/app/api/__pycache__/fact_check.cpython-312.pyc index b5709d2..f784c29 100644 Binary files a/app/api/__pycache__/fact_check.cpython-312.pyc and b/app/api/__pycache__/fact_check.cpython-312.pyc differ diff --git a/app/api/ai_fact_check.py b/app/api/ai_fact_check.py index 6d1f2d7..c848b1b 100644 --- a/app/api/ai_fact_check.py +++ b/app/api/ai_fact_check.py @@ -6,7 +6,7 @@ from app.models.ai_fact_check_models import ( AIFactCheckResponse, VerificationResult, TokenUsage, - ErrorResponse + ErrorResponse, ) from urllib.parse import urlparse import asyncio @@ -16,13 +16,11 @@ aifact_check_router = APIRouter() openai_client = OpenAIClient(api_key=OPENAI_API_KEY) fact_checker = AIFactChecker(openai_client=openai_client) + @aifact_check_router.post( "/aicheck-facts", response_model=AIFactCheckResponse, - responses={ - 400: {"model": ErrorResponse}, - 500: {"model": ErrorResponse} - } + responses={400: {"model": ErrorResponse}, 500: {"model": ErrorResponse}}, ) async def ai_fact_check(request: AIFactCheckRequest): """ @@ -40,14 +38,14 @@ async def ai_fact_check(request: AIFactCheckRequest): total_prompt_tokens = 0 total_completion_tokens = 0 total_tokens = 0 - + # Process all URLs concurrently tasks = [ fact_checker.check_fact(url=url, query=request.content) for url in request.urls ] fact_check_results = await asyncio.gather(*tasks, return_exceptions=True) - + # Process results for url, result in zip(request.urls, fact_check_results): if isinstance(result, Exception): @@ -57,21 +55,21 @@ async def ai_fact_check(request: AIFactCheckRequest): confidence="Low", evidence=f"Error checking URL: {str(result)}", reasoning="URL processing failed", - missing_info="Could not access or process the URL" + missing_info="Could not access or process the URL", ) continue - + verification_result = VerificationResult( verdict=result["verification_result"]["verdict"], confidence=result["verification_result"]["confidence"], evidence=result["verification_result"]["evidence"], reasoning=result["verification_result"]["reasoning"], - missing_info=result["verification_result"].get("missing_info", None) + missing_info=result["verification_result"].get("missing_info", None), ) - + results[url] = verification_result all_sources.update(result["sources"]) - + # Accumulate token usage total_prompt_tokens += result["token_usage"]["prompt_tokens"] total_completion_tokens += result["token_usage"]["completion_tokens"] @@ -80,24 +78,22 @@ async def ai_fact_check(request: AIFactCheckRequest): token_usage = TokenUsage( prompt_tokens=total_prompt_tokens, completion_tokens=total_completion_tokens, - total_tokens=total_tokens + total_tokens=total_tokens, ) return AIFactCheckResponse( query=request.content, verification_result=results, sources=list(all_sources), - token_usage=token_usage + token_usage=token_usage, ) except ValueError as e: raise HTTPException( status_code=400, detail=ErrorResponse( - detail=str(e), - error_code="INVALID_URL", - path="/aicheck-facts" - ).dict() + detail=str(e), error_code="INVALID_URL", path="/aicheck-facts" + ).dict(), ) except Exception as e: raise HTTPException( @@ -105,6 +101,6 @@ async def ai_fact_check(request: AIFactCheckRequest): detail=ErrorResponse( detail=f"Error processing fact-check request: {str(e)}", error_code="PROCESSING_ERROR", - path="/aicheck-facts" - ).dict() - ) \ No newline at end of file + path="/aicheck-facts", + ).dict(), + ) diff --git a/app/api/fact_check.py b/app/api/fact_check.py index b52ef24..ab4cd9f 100644 --- a/app/api/fact_check.py +++ b/app/api/fact_check.py @@ -4,16 +4,17 @@ from app.config import GOOGLE_API_KEY, GOOGLE_FACT_CHECK_BASE_URL, OPENAI_API_KE from app.api.scrap_websites import search_websites, SearchRequest from app.services.openai_client import OpenAIClient from app.models.fact_check_models import ( - FactCheckRequest, - FactCheckResponse, + FactCheckRequest, + FactCheckResponse, ErrorResponse, - Source + Source, ) from app.websites.fact_checker_website import get_all_sources fact_check_router = APIRouter() openai_client = OpenAIClient(OPENAI_API_KEY) + async def generate_fact_report(query: str, fact_check_data: dict) -> FactCheckResponse: """Generate a fact check report using OpenAI based on the fact check results.""" try: @@ -55,7 +56,7 @@ Ensure all URLs in sources are complete (including https:// if missing) and each 2. Specify verification dates when available 3. Name the fact-checking organizations involved 4. Describe the verification process""" - + else: system_prompt = base_system_prompt user_prompt = f"""Query: {query} @@ -70,37 +71,34 @@ Ensure all URLs in sources are complete (including https:// if missing) and each 4. Note any conflicting information between sources""" response = await openai_client.generate_text_response( - system_prompt=system_prompt, - user_prompt=user_prompt, - max_tokens=1000 + system_prompt=system_prompt, user_prompt=user_prompt, max_tokens=1000 ) - + try: # First try to parse the response directly response_data = response["response"] - + # Clean up sources before validation - if isinstance(response_data.get('sources'), list): + if isinstance(response_data.get("sources"), list): cleaned_sources = [] - for source in response_data['sources']: + for source in response_data["sources"]: if isinstance(source, str): # Convert string sources to Source objects - url = source if source.startswith('http') else f"https://{source}" - cleaned_sources.append({ - "url": url, - "name": source - }) + url = ( + source if source.startswith("http") else f"https://{source}" + ) + cleaned_sources.append({"url": url, "name": source}) elif isinstance(source, dict): # Ensure URL has proper scheme - url = source.get('url', '') - if url and not url.startswith('http'): - source['url'] = f"https://{url}" + url = source.get("url", "") + if url and not url.startswith("http"): + source["url"] = f"https://{url}" cleaned_sources.append(source) - response_data['sources'] = cleaned_sources - + response_data["sources"] = cleaned_sources + fact_check_response = FactCheckResponse(**response_data) return fact_check_response - + except Exception as validation_error: print(f"Response validation error: {str(validation_error)}") raise HTTPException( @@ -108,10 +106,10 @@ Ensure all URLs in sources are complete (including https:// if missing) and each detail=ErrorResponse( detail=f"Invalid response format: {str(validation_error)}", error_code="VALIDATION_ERROR", - path="/check-facts" - ).dict() + path="/check-facts", + ).dict(), ) - + except Exception as e: print(f"Error generating fact report: {str(e)}") raise HTTPException( @@ -119,10 +117,11 @@ Ensure all URLs in sources are complete (including https:// if missing) and each detail=ErrorResponse( detail="Error generating fact report", error_code="FACT_CHECK_ERROR", - path="/check-facts" - ).dict() + path="/check-facts", + ).dict(), ) + @fact_check_router.post("/check-facts", response_model=FactCheckResponse) async def check_facts(request: FactCheckRequest): """ @@ -134,52 +133,49 @@ async def check_facts(request: FactCheckRequest): detail=ErrorResponse( detail="Google API key or base URL is not configured", error_code="CONFIGURATION_ERROR", - path="/check-facts" - ).dict() + path="/check-facts", + ).dict(), ) headers = {"Content-Type": "application/json"} async with httpx.AsyncClient() as client: # Get fact checker sources from the centralized configuration fact_checker_sources = get_all_sources() - + for source in fact_checker_sources: params = { "key": GOOGLE_API_KEY, "query": request.query, "languageCode": "en-US", "reviewPublisherSiteFilter": source.domain, - "pageSize": 10 + "pageSize": 10, } try: response = await client.get( - GOOGLE_FACT_CHECK_BASE_URL, - params=params, - headers=headers + GOOGLE_FACT_CHECK_BASE_URL, params=params, headers=headers ) response.raise_for_status() json_response = response.json() if json_response.get("claims"): return await generate_fact_report(request.query, json_response) - + except httpx.RequestError as e: print(f"Error fetching results for site {source.domain}: {str(e)}") continue except Exception as e: print(f"Unexpected error for site {source.domain}: {str(e)}") continue - + try: search_request = SearchRequest( - search_text=request.query, - source_types=["fact_checkers"] + search_text=request.query, source_types=["fact_checkers"] ) - + ai_response = await search_websites(search_request) return await generate_fact_report(request.query, ai_response) - + except Exception as e: print(f"Error in AI fact check: {str(e)}") raise HTTPException( @@ -187,6 +183,6 @@ async def check_facts(request: FactCheckRequest): detail=ErrorResponse( detail="No fact check results found", error_code="NOT_FOUND", - path="/check-facts" - ).dict() - ) \ No newline at end of file + path="/check-facts", + ).dict(), + ) diff --git a/app/api/scrap_websites.py b/app/api/scrap_websites.py index 946ec01..f685158 100644 --- a/app/api/scrap_websites.py +++ b/app/api/scrap_websites.py @@ -7,7 +7,7 @@ from pydantic import BaseModel from app.models.ai_fact_check_models import ( AIFactCheckRequest, FactCheckSource, - SourceType + SourceType, ) from app.websites.fact_checker_website import SOURCES, get_all_sources from app.api.ai_fact_check import ai_fact_check @@ -18,10 +18,10 @@ class SearchRequest(BaseModel): search_text: str source_types: List[str] = ["fact_checkers"] + # Configure logging logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" ) logger = logging.getLogger(__name__) @@ -38,51 +38,58 @@ def get_domain_from_url(url: str) -> str: try: parsed = urlparse(url) domain = parsed.netloc.lower() - if domain.startswith('www.'): + if domain.startswith("www."): domain = domain[4:] return domain except Exception as e: logger.error(f"Error extracting domain from URL {url}: {str(e)}") return "" + def is_valid_source_domain(domain: str, sources: List[FactCheckSource]) -> bool: """Check if domain matches any source with improved matching logic.""" if not domain: return False - + domain = domain.lower() - if domain.startswith('www.'): + if domain.startswith("www."): domain = domain[4:] - + for source in sources: source_domain = source.domain.lower() - if source_domain.startswith('www.'): + if source_domain.startswith("www."): source_domain = source_domain[4:] - - if domain == source_domain or domain.endswith('.' + source_domain): + + if domain == source_domain or domain.endswith("." + source_domain): return True - + return False -async def build_enhanced_search_query(query: str, sources: List[FactCheckSource]) -> str: + +async def build_enhanced_search_query( + query: str, sources: List[FactCheckSource] +) -> str: """Build search query with site restrictions.""" site_queries = [f"site:{source.domain}" for source in sources] site_restriction = " OR ".join(site_queries) return f"({query}) ({site_restriction})" -async def google_custom_search(query: str, sources: List[FactCheckSource], page: int = 1) -> Optional[Dict]: + +async def google_custom_search( + query: str, sources: List[FactCheckSource], page: int = 1 +) -> Optional[Dict]: """Perform Google Custom Search with enhanced query.""" enhanced_query = await build_enhanced_search_query(query, sources) start_index = ((page - 1) * RESULTS_PER_PAGE) + 1 - + params = { "key": GOOGLE_API_KEY, "cx": GOOGLE_ENGINE_ID, "q": enhanced_query, "num": RESULTS_PER_PAGE, - "start": start_index + "start": start_index, } - + async with httpx.AsyncClient(timeout=30.0) as client: try: response = await client.get(GOOGLE_SEARCH_URL, params=params) @@ -92,69 +99,70 @@ async def google_custom_search(query: str, sources: List[FactCheckSource], page: logger.error(f"Search error: {str(e)}") raise HTTPException(status_code=500, detail=f"Search error: {str(e)}") + @scrap_websites_router.post("/search") async def search_websites(request: SearchRequest): # Get the source types from the request source_types = request.source_types if request.source_types else ["fact_checkers"] - + # Get sources based on requested types selected_sources = [] for source_type in source_types: if source_type in SOURCES: selected_sources.extend(SOURCES[source_type]) - + # If no valid sources found, use fact checkers as default if not selected_sources: selected_sources = SOURCES["fact_checkers"] - + all_urls = [] domain_results = {} - + try: for page in range(1, MAX_PAGES + 1): if len(all_urls) >= 50: break - - search_response = await google_custom_search(request.search_text, selected_sources, page) - + + search_response = await google_custom_search( + request.search_text, selected_sources, page + ) + if not search_response or not search_response.get("items"): break - + for item in search_response.get("items", []): url = item.get("link") if not url: continue - + domain = get_domain_from_url(url) - + if is_valid_source_domain(domain, selected_sources): if domain not in domain_results: domain_results[domain] = [] - + if len(domain_results[domain]) < MAX_URLS_PER_DOMAIN: - domain_results[domain].append({ - "url": url, - "title": item.get("title", ""), - "snippet": item.get("snippet", "") - }) + domain_results[domain].append( + { + "url": url, + "title": item.get("title", ""), + "snippet": item.get("snippet", ""), + } + ) all_urls.append(url) - + if len(all_urls) >= 50: break - + if not all_urls: - return { - "status": "no_results", - "urls_found": 0 - } - + return {"status": "no_results", "urls_found": 0} + fact_check_request = AIFactCheckRequest( - content=request.search_text, - urls=all_urls[:5] + content=request.search_text, urls=all_urls[:5] ) - + return await ai_fact_check(fact_check_request) except Exception as e: logger.error(f"Error during search/fact-check process: {str(e)}") - raise HTTPException(status_code=500, detail=str(e)) \ No newline at end of file + raise HTTPException(status_code=500, detail=str(e)) diff --git a/app/config.py b/app/config.py index b890247..6e7437c 100644 --- a/app/config.py +++ b/app/config.py @@ -4,9 +4,9 @@ from dotenv import load_dotenv load_dotenv() GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"] -GOOGLE_FACT_CHECK_BASE_URL= os.environ["GOOGLE_FACT_CHECK_BASE_URL"] +GOOGLE_FACT_CHECK_BASE_URL = os.environ["GOOGLE_FACT_CHECK_BASE_URL"] GOOGLE_ENGINE_ID = os.environ["GOOGLE_ENGINE_ID"] GOOGLE_SEARCH_URL = os.environ["GOOGLE_SEARCH_URL"] OPENAI_API_KEY = os.environ["OPENAI_API_KEY"] -FRONTEND_URL = os.environ["FRONTEND_URL"] \ No newline at end of file +FRONTEND_URL = os.environ["FRONTEND_URL"] diff --git a/app/models/__pycache__/fact_check_models.cpython-312.pyc b/app/models/__pycache__/fact_check_models.cpython-312.pyc index 7cb8e9a..1e810e2 100644 Binary files a/app/models/__pycache__/fact_check_models.cpython-312.pyc and b/app/models/__pycache__/fact_check_models.cpython-312.pyc differ diff --git a/app/models/ai_fact_check_models.py b/app/models/ai_fact_check_models.py index 0949e51..525b1cb 100644 --- a/app/models/ai_fact_check_models.py +++ b/app/models/ai_fact_check_models.py @@ -4,38 +4,46 @@ from enum import Enum from datetime import datetime from urllib.parse import urlparse + # Common Models class TokenUsage(BaseModel): prompt_tokens: Optional[int] = 0 completion_tokens: Optional[int] = 0 total_tokens: Optional[int] = 0 + class ErrorResponse(BaseModel): detail: str error_code: str = Field(..., description="Unique error code for this type of error") timestamp: str = Field(default_factory=lambda: datetime.now().isoformat()) - path: Optional[str] = Field(None, description="The endpoint path where error occurred") + path: Optional[str] = Field( + None, description="The endpoint path where error occurred" + ) - model_config = ConfigDict(json_schema_extra={ - "example": { - "detail": "Error description", - "error_code": "ERROR_CODE", - "timestamp": "2024-12-09T16:49:30.905765", - "path": "/check-facts" + model_config = ConfigDict( + json_schema_extra={ + "example": { + "detail": "Error description", + "error_code": "ERROR_CODE", + "timestamp": "2024-12-09T16:49:30.905765", + "path": "/check-facts", + } } - }) + ) + # Fact Check Models class Publisher(BaseModel): name: str site: Optional[str] = Field(None, description="Publisher's website") - - @validator('site') + + @validator("site") def validate_site(cls, v): - if v and not (v.startswith('http://') or v.startswith('https://')): + if v and not (v.startswith("http://") or v.startswith("https://")): return f"https://{v}" return v + class ClaimReview(BaseModel): publisher: Publisher url: Optional[HttpUrl] = None @@ -44,21 +52,25 @@ class ClaimReview(BaseModel): textualRating: Optional[str] = None languageCode: str = Field(default="en-US") + class Claim(BaseModel): text: str claimant: Optional[str] = None claimDate: Optional[str] = None claimReview: List[ClaimReview] + class SourceType(str, Enum): FACT_CHECKER = "fact_checker" NEWS_SITE = "news_site" + class FactCheckSource(BaseModel): domain: str type: SourceType priority: int = Field(default=1, ge=1, le=10) + # Verification Models class VerificationResult(BaseModel): verdict: str = Field(..., description="True/False/Insufficient Information") @@ -67,54 +79,56 @@ class VerificationResult(BaseModel): reasoning: str missing_info: Optional[str] = None - model_config = ConfigDict(json_schema_extra={ - "example": { - "verdict": "True", - "confidence": "High", - "evidence": ["Direct quote from source supporting the claim"], - "reasoning": "Detailed analysis of why the claim is considered true", - "missing_info": "Any caveats or limitations of the verification" + model_config = ConfigDict( + json_schema_extra={ + "example": { + "verdict": "True", + "confidence": "High", + "evidence": ["Direct quote from source supporting the claim"], + "reasoning": "Detailed analysis of why the claim is considered true", + "missing_info": "Any caveats or limitations of the verification", + } } - }) + ) + # Request Models class BaseFactCheckRequest(BaseModel): content: str = Field( - ..., - min_length=10, - max_length=1000, - description="The claim to be fact-checked" + ..., min_length=10, max_length=1000, description="The claim to be fact-checked" ) - - @validator('content') + + @validator("content") def validate_content(cls, v): if not v.strip(): raise ValueError("Content cannot be empty or just whitespace") return v.strip() + class GoogleFactCheckRequest(BaseFactCheckRequest): language: str = Field(default="en-US", pattern="^[a-z]{2}-[A-Z]{2}$") max_results_per_source: int = Field(default=10, ge=1, le=50) + class AIFactCheckRequest(BaseFactCheckRequest): urls: List[str] = Field( ..., min_items=1, max_items=5, - description="List of URLs to check the content against. URLs will be prefixed with https:// if protocol is missing" + description="List of URLs to check the content against. URLs will be prefixed with https:// if protocol is missing", ) - - @validator('urls') + + @validator("urls") def validate_urls(cls, urls): validated_urls = [] for url in urls: if not url.strip(): raise ValueError("URL cannot be empty") - + # Add https:// if no protocol specified - if not url.startswith(('http://', 'https://')): - url = f'https://{url}' - + if not url.startswith(("http://", "https://")): + url = f"https://{url}" + try: result = urlparse(url) if not result.netloc: @@ -122,18 +136,21 @@ class AIFactCheckRequest(BaseFactCheckRequest): validated_urls.append(url) except Exception as e: raise ValueError(f"Invalid URL {url}: {str(e)}") - + return validated_urls - model_config = ConfigDict(json_schema_extra={ - "example": { - "content": "Indian flag was drawn in BUET campus", - "urls": [ - "www.altnews.in/article-about-flag", - "www.another-source.com/related-news" - ] + model_config = ConfigDict( + json_schema_extra={ + "example": { + "content": "Indian flag was drawn in BUET campus", + "urls": [ + "www.altnews.in/article-about-flag", + "www.another-source.com/related-news", + ], + } } - }) + ) + # Response Models class BaseFactCheckResponse(BaseModel): @@ -141,17 +158,20 @@ class BaseFactCheckResponse(BaseModel): token_usage: TokenUsage sources: List[str] - model_config = ConfigDict(json_schema_extra={ - "example": { - "query": "Example statement to verify", - "token_usage": { - "prompt_tokens": 100, - "completion_tokens": 50, - "total_tokens": 150 - }, - "sources": ["source1.com", "source2.com"], + model_config = ConfigDict( + json_schema_extra={ + "example": { + "query": "Example statement to verify", + "token_usage": { + "prompt_tokens": 100, + "completion_tokens": 50, + "total_tokens": 150, + }, + "sources": ["source1.com", "source2.com"], + } } - }) + ) + class GoogleFactCheckResponse(BaseFactCheckResponse): total_claims_found: int @@ -159,71 +179,80 @@ class GoogleFactCheckResponse(BaseFactCheckResponse): verification_result: Dict[str, Any] summary: Dict[str, int] - model_config = ConfigDict(json_schema_extra={ - "example": { - "query": "Example claim", - "total_claims_found": 1, - "results": [{ - "text": "Example claim text", - "claimant": "Source name", - "claimReview": [{ - "publisher": { - "name": "Fact Checker", - "site": "factchecker.com" - }, - "textualRating": "True" - }] - }], - "verification_result": { - "verdict": "True", - "confidence": "High", - "evidence": ["Supporting evidence"], - "reasoning": "Detailed analysis" - }, - "sources": ["factchecker.com"], - "token_usage": { - "prompt_tokens": 100, - "completion_tokens": 50, - "total_tokens": 150 - }, - "summary": { - "total_sources": 1, - "fact_checking_sites_queried": 10 - } - } - }) - -class AIFactCheckResponse(BaseFactCheckResponse): - verification_result: Dict[str, VerificationResult] # Changed to Dict to store results per URL - - model_config = ConfigDict(json_schema_extra={ - "example": { - "query": "Indian flag was drawn in BUET campus", - "verification_result": { - "https://www.source1.com": { + model_config = ConfigDict( + json_schema_extra={ + "example": { + "query": "Example claim", + "total_claims_found": 1, + "results": [ + { + "text": "Example claim text", + "claimant": "Source name", + "claimReview": [ + { + "publisher": { + "name": "Fact Checker", + "site": "factchecker.com", + }, + "textualRating": "True", + } + ], + } + ], + "verification_result": { "verdict": "True", "confidence": "High", - "evidence": ["Supporting evidence from source 1"], - "reasoning": "Detailed analysis from source 1", - "missing_info": None + "evidence": ["Supporting evidence"], + "reasoning": "Detailed analysis", }, - "https://www.source2.com": { - "verdict": "True", - "confidence": "Medium", - "evidence": ["Supporting evidence from source 2"], - "reasoning": "Analysis from source 2", - "missing_info": "Additional context needed" - } - }, - "sources": ["source1.com", "source2.com"], - "token_usage": { - "prompt_tokens": 200, - "completion_tokens": 100, - "total_tokens": 300 + "sources": ["factchecker.com"], + "token_usage": { + "prompt_tokens": 100, + "completion_tokens": 50, + "total_tokens": 150, + }, + "summary": {"total_sources": 1, "fact_checking_sites_queried": 10}, } } - }) + ) + + +class AIFactCheckResponse(BaseFactCheckResponse): + verification_result: Dict[ + str, VerificationResult + ] # Changed to Dict to store results per URL + + model_config = ConfigDict( + json_schema_extra={ + "example": { + "query": "Indian flag was drawn in BUET campus", + "verification_result": { + "https://www.source1.com": { + "verdict": "True", + "confidence": "High", + "evidence": ["Supporting evidence from source 1"], + "reasoning": "Detailed analysis from source 1", + "missing_info": None, + }, + "https://www.source2.com": { + "verdict": "True", + "confidence": "Medium", + "evidence": ["Supporting evidence from source 2"], + "reasoning": "Analysis from source 2", + "missing_info": "Additional context needed", + }, + }, + "sources": ["source1.com", "source2.com"], + "token_usage": { + "prompt_tokens": 200, + "completion_tokens": 100, + "total_tokens": 300, + }, + } + } + ) + # Backwards compatibility aliases FactCheckRequest = GoogleFactCheckRequest -FactCheckResponse = GoogleFactCheckResponse \ No newline at end of file +FactCheckResponse = GoogleFactCheckResponse diff --git a/app/models/fact_check_models.py b/app/models/fact_check_models.py index 1b30511..3ab4a8c 100644 --- a/app/models/fact_check_models.py +++ b/app/models/fact_check_models.py @@ -3,74 +3,73 @@ from typing import List, Literal, Union from datetime import datetime from enum import Enum + class VerdictEnum(str, Enum): TRUE = "True" FALSE = "False" PARTIALLY_TRUE = "Partially True" UNVERIFIED = "Unverified" + class ConfidenceEnum(str, Enum): HIGH = "High" MEDIUM = "Medium" LOW = "Low" + class FactCheckRequest(BaseModel): query: str = Field( ..., min_length=3, max_length=500, description="The claim or statement to be fact-checked", - example="Did NASA confirm finding alien structures on Mars in 2024?" + example="Did NASA confirm finding alien structures on Mars in 2024?", ) + class Source(BaseModel): url: str name: str = "" - @validator('url') + @validator("url") def validate_url(cls, v): # Basic URL validation without requiring HTTP/HTTPS if not v or len(v) < 3: raise ValueError("URL must not be empty and must be at least 3 characters") return v + class FactCheckResponse(BaseModel): claim: str = Field( ..., min_length=10, max_length=1000, - description="The exact claim being verified" - ) - verdict: VerdictEnum = Field( - ..., - description="The verification verdict" + description="The exact claim being verified", ) + verdict: VerdictEnum = Field(..., description="The verification verdict") confidence: ConfidenceEnum = Field( - ..., - description="Confidence level in the verdict" + ..., description="Confidence level in the verdict" ) sources: List[Source] = Field( - ..., - min_items=1, - description="List of sources used in verification" + ..., min_items=1, description="List of sources used in verification" ) evidence: str = Field( ..., min_length=20, max_length=500, - description="Concise summary of key evidence" + description="Concise summary of key evidence", ) explanation: str = Field( ..., min_length=50, max_length=1000, - description="Detailed explanation of verification findings" + description="Detailed explanation of verification findings", ) additional_context: str = Field( ..., min_length=20, max_length=500, - description="Important context about the verification" + description="Important context about the verification", ) class Config: @@ -82,20 +81,21 @@ class FactCheckResponse(BaseModel): "sources": [ { "url": "https://www.nasa.gov/mars-exploration", - "name": "NASA Mars Exploration" + "name": "NASA Mars Exploration", }, { "url": "https://factcheck.org/2024/mars-claims", - "name": "FactCheck.org" - } + "name": "FactCheck.org", + }, ], "evidence": "NASA has made no such announcement. Recent Mars rover images show natural rock formations.", "explanation": "Multiple fact-checking organizations investigated this claim. NASA's official communications and Mars mission reports from 2024 contain no mention of alien structures. The viral images being shared are misidentified natural geological formations.", - "additional_context": "Similar false claims about alien structures on Mars have circulated periodically since the first Mars rovers began sending back images." + "additional_context": "Similar false claims about alien structures on Mars have circulated periodically since the first Mars rovers began sending back images.", } } + class ErrorResponse(BaseModel): detail: str error_code: str = Field(..., example="VALIDATION_ERROR") - path: str = Field(..., example="/check-facts") \ No newline at end of file + path: str = Field(..., example="/check-facts") diff --git a/app/models/scrap_websites_models.py b/app/models/scrap_websites_models.py index 1c629c5..39dd949 100644 --- a/app/models/scrap_websites_models.py +++ b/app/models/scrap_websites_models.py @@ -1,38 +1,46 @@ from pydantic import BaseModel from typing import List, Dict + class SearchRequest(BaseModel): search_text: str source_types: List[str] = ["fact_checkers"] + class Publisher(BaseModel): name: str site: str + class ClaimReview(BaseModel): publisher: Publisher textualRating: str + class Claim(BaseModel): claimReview: List[ClaimReview] claimant: str text: str + class Summary(BaseModel): fact_checking_sites_queried: int total_sources: int + class TokenUsage(BaseModel): prompt_tokens: int completion_tokens: int total_tokens: int + class VerificationResult(BaseModel): verdict: str confidence: str evidence: List[str] reasoning: str + class EnhancedFactCheckResponse(BaseModel): query: str results: List[Claim] @@ -40,4 +48,4 @@ class EnhancedFactCheckResponse(BaseModel): summary: Summary token_usage: Dict[str, int] total_claims_found: int - verification_result: VerificationResult \ No newline at end of file + verification_result: VerificationResult diff --git a/app/services/openai_client.py b/app/services/openai_client.py index e6d2f76..06e0f46 100644 --- a/app/services/openai_client.py +++ b/app/services/openai_client.py @@ -9,6 +9,7 @@ import json import aiohttp from bs4 import BeautifulSoup + class OpenAIClient: def __init__(self, api_key: str): """ @@ -16,7 +17,9 @@ class OpenAIClient: """ openai.api_key = api_key - async def generate_text_response(self, system_prompt: str, user_prompt: str, max_tokens: int) -> dict: + async def generate_text_response( + self, system_prompt: str, user_prompt: str, max_tokens: int + ) -> dict: """ Generate a response using OpenAI's chat completion API. """ @@ -25,19 +28,19 @@ class OpenAIClient: model="gpt-4", messages=[ {"role": "system", "content": system_prompt}, - {"role": "user", "content": user_prompt} + {"role": "user", "content": user_prompt}, ], - max_tokens=max_tokens + max_tokens=max_tokens, ) - content = response['choices'][0]['message']['content'] + content = response["choices"][0]["message"]["content"] # Parse the JSON string into a dictionary parsed_content = json.loads(content) - + return { "response": parsed_content, # Now returns a dictionary instead of string - "prompt_tokens": response['usage']['prompt_tokens'], - "completion_tokens": response['usage']['completion_tokens'], - "total_tokens": response['usage']['total_tokens'] + "prompt_tokens": response["usage"]["prompt_tokens"], + "completion_tokens": response["usage"]["completion_tokens"], + "total_tokens": response["usage"]["total_tokens"], } except json.JSONDecodeError as e: raise Exception(f"Failed to parse OpenAI response as JSON: {str(e)}") @@ -50,14 +53,14 @@ class OpenAIClient: """ try: response = openai.Embedding.create( - input=texts, - model="text-embedding-ada-002" + input=texts, model="text-embedding-ada-002" ) - embeddings = [data['embedding'] for data in response['data']] + embeddings = [data["embedding"] for data in response["data"]] return embeddings except Exception as e: raise Exception(f"OpenAI embedding error: {str(e)}") + class AIFactChecker: def __init__(self, openai_client: OpenAIClient): """Initialize the fact checker with OpenAI client.""" @@ -66,65 +69,71 @@ class AIFactChecker: chunk_size=1000, chunk_overlap=200, length_function=len, - separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""] + separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""], ) - + async def scrape_webpage(self, url: str) -> List[Document]: """Scrape webpage content without saving HTML files.""" try: async with aiohttp.ClientSession() as session: async with session.get(url) as response: if response.status != 200: - raise Exception(f"Failed to fetch URL: {url}, status: {response.status}") - + raise Exception( + f"Failed to fetch URL: {url}, status: {response.status}" + ) + html_content = await response.text() - + # Parse HTML with BeautifulSoup - soup = BeautifulSoup(html_content, 'html.parser') - + soup = BeautifulSoup(html_content, "html.parser") + # Create a Document with the parsed content doc = Document( - page_content=soup.get_text(separator='\n', strip=True), - metadata={"source": url} + page_content=soup.get_text(separator="\n", strip=True), + metadata={"source": url}, ) - + # Split into chunks docs_chunks = self.text_splitter.split_documents([doc]) - - logger.info(f"Successfully scraped webpage | chunks={len(docs_chunks)}") + + logger.info( + f"Successfully scraped webpage | chunks={len(docs_chunks)}" + ) return docs_chunks - + except Exception as e: logger.error(f"Error scraping webpage | url={url} | error={str(e)}") raise def find_relevant_chunks( - self, - query_embedding: List[float], - doc_embeddings: List[List[float]], - docs: List[Document] + self, + query_embedding: List[float], + doc_embeddings: List[List[float]], + docs: List[Document], ) -> List[Document]: """Find most relevant document chunks using cosine similarity.""" try: query_array = np.array(query_embedding) chunks_array = np.array(doc_embeddings) - + similarities = np.dot(chunks_array, query_array) / ( np.linalg.norm(chunks_array, axis=1) * np.linalg.norm(query_array) ) - + top_indices = np.argsort(similarities)[-5:][::-1] return [docs[i] for i in top_indices] - + except Exception as e: logger.error(f"Error finding relevant chunks | error={str(e)}") raise - async def verify_fact(self, query: str, relevant_docs: List[Document]) -> Dict[str, Any]: + async def verify_fact( + self, query: str, relevant_docs: List[Document] + ) -> Dict[str, Any]: """Verify fact using OpenAI's API with context from relevant documents.""" try: context = "\n\n".join([doc.page_content for doc in relevant_docs]) - + system_prompt = """You are a professional fact-checking assistant. Analyze the provided context and determine if the given statement is true, false, or if there isn't enough information. @@ -136,32 +145,37 @@ class AIFactChecker: "reasoning": "Your detailed analysis and reasoning", "missing_info": "Any important missing information (if applicable)" }""" - + user_prompt = f"""Context: {context} Statement to verify: "{query}" Analyze the statement based on the provided context and return your response in the specified JSON format.""" - + response = await self.openai_client.generate_text_response( - system_prompt=system_prompt, - user_prompt=user_prompt, - max_tokens=800 + system_prompt=system_prompt, user_prompt=user_prompt, max_tokens=800 ) - - sources = list(set([doc.metadata.get('source', 'Unknown source') for doc in relevant_docs])) - + + sources = list( + set( + [ + doc.metadata.get("source", "Unknown source") + for doc in relevant_docs + ] + ) + ) + return { "verification_result": response["response"], # This is now a dictionary "sources": sources, "token_usage": { "prompt_tokens": response["prompt_tokens"], "completion_tokens": response["completion_tokens"], - "total_tokens": response["total_tokens"] - } + "total_tokens": response["total_tokens"], + }, } - + except Exception as e: logger.error(f"Error verifying fact | error={str(e)}") raise @@ -170,16 +184,18 @@ class AIFactChecker: """Main method to check a fact against a webpage.""" try: docs = await self.scrape_webpage(url) - + doc_texts = [doc.page_content for doc in docs] doc_embeddings = self.openai_client.get_embeddings(doc_texts) query_embedding = self.openai_client.get_embeddings([query]) - - relevant_docs = self.find_relevant_chunks(query_embedding[0], doc_embeddings, docs) + + relevant_docs = self.find_relevant_chunks( + query_embedding[0], doc_embeddings, docs + ) verification_result = await self.verify_fact(query, relevant_docs) - + return verification_result - + except Exception as e: logger.error(f"Error checking fact | error={str(e)}") - raise \ No newline at end of file + raise diff --git a/app/websites/__pycache__/fact_checker_website.cpython-312.pyc b/app/websites/__pycache__/fact_checker_website.cpython-312.pyc index b0b0fa4..e4ce169 100644 Binary files a/app/websites/__pycache__/fact_checker_website.cpython-312.pyc and b/app/websites/__pycache__/fact_checker_website.cpython-312.pyc differ diff --git a/app/websites/fact_checker_website.py b/app/websites/fact_checker_website.py index 2e4934b..8cde5b6 100644 --- a/app/websites/fact_checker_website.py +++ b/app/websites/fact_checker_website.py @@ -1,120 +1,125 @@ from typing import Dict, List import requests from fastapi import HTTPException -from app.models.ai_fact_check_models import FactCheckSource, ErrorResponse, FactCheckRequest, SourceType +from app.models.ai_fact_check_models import ( + FactCheckSource, + ErrorResponse, + FactCheckRequest, + SourceType, +) # Sources configuration with validation SOURCES = { "fact_checkers": [ FactCheckSource(domain=domain, type=SourceType.FACT_CHECKER, priority=1) for domain in [ - "snopes.com", - "politifact.com", - "factcheck.org", - "reuters.com/fact-check", - "apnews.com/hub/ap-fact-check", - "bbc.com/news/reality_check", - "fullfact.org", - "afp.com/fact-check", - "truthorfiction.com", - "leadstories.com", - "checkyourfact.com", - "washingtonpost.com/news/fact-checker", - "factcheck.kz", - "poynter.org/ifcn", - "factcheckeu.info", - "africacheck.org", - "thequint.com/webqoof", - "altnews.in", - "facta.news", - "factcheckni.org", - "mythdetector.ge", - "verificado.mx", - "euvsdisinfo.eu", - "factcheck.afp.com", - "newtral.es", - "maldita.es", - "faktograf.hr", - "demagog.org.pl", - "factnameh.com", - "faktiskt.se", - "teyit.org", - "factly.in", - "boom.live", - "stopfake.org", - "factcheck.ge", - "factcheck.kg", - "factcheck.uz", - "factcheck.tj", - "factcheck.az", - "factcheck.am", - "factcheck.md", - "verafiles.org", - "rappler.com/fact-check", - "vera.com.gt", - "chequeado.com", - "aosfatos.org", - "lasillavacia.com/detector-mentiras", - "colombiacheck.com", - "ecuadorchequea.com", - "elsurti.com/checado", - "verificat.cat", - "mafindo.or.id", - "tempo.co/cek-fakta", - "factcheck.mk", - "raskrinkavanje.ba", - "faktograf.hr", - "demagog.cz", - "faktabaari.fi", - "correctiv.org", - "mimikama.at", - "factcheck.vlaanderen", - "factuel.afp.com", - "nieuwscheckers.nl", - "faktisk.no", - "tjekdet.dk", - "ellinikahoaxes.gr", - "faktograf.id", - "stopfake.kz", - "pesacheck.org", - "dubawa.org", - "namibiafactcheck.org.na", - "zimfact.org", - "ghanafact.com", - "factspace.africa", - "factcrescendo.com", - "vishvasnews.com", - "factcheck.lk", - "newschecker.in", - "boomlive.in", - "digiteye.in", - "indiatoday.in/fact-check", - "factcrescendo.com", - "piyasa.com/fact-check", - "taiwanese.facts.news", - "taiwanfactcheck.com", - "mygopen.com", - "tfc-taiwan.org.tw", - "cofacts.tw", - "rumor.taipei", - "fact.qq.com", - "factcheck.afp.com/list", - "acfta.org", - "crosscheck.firstdraftnews.org", - "healthfeedback.org", - "climatefeedback.org", - "sciencefeedback.co", - "factcheck.aap.com.au", - "emergent.info", - "hoax-slayer.net", - "truthorfiction.com", - "factcheck.media", - "mediawise.org", - "thejournal.ie/factcheck", - "journalistsresource.org", - "metafact.io", - "reporterslab.org/fact-checking" -] + "snopes.com", + "politifact.com", + "factcheck.org", + "reuters.com/fact-check", + "apnews.com/hub/ap-fact-check", + "bbc.com/news/reality_check", + "fullfact.org", + "afp.com/fact-check", + "truthorfiction.com", + "leadstories.com", + "checkyourfact.com", + "washingtonpost.com/news/fact-checker", + "factcheck.kz", + "poynter.org/ifcn", + "factcheckeu.info", + "africacheck.org", + "thequint.com/webqoof", + "altnews.in", + "facta.news", + "factcheckni.org", + "mythdetector.ge", + "verificado.mx", + "euvsdisinfo.eu", + "factcheck.afp.com", + "newtral.es", + "maldita.es", + "faktograf.hr", + "demagog.org.pl", + "factnameh.com", + "faktiskt.se", + "teyit.org", + "factly.in", + "boom.live", + "stopfake.org", + "factcheck.ge", + "factcheck.kg", + "factcheck.uz", + "factcheck.tj", + "factcheck.az", + "factcheck.am", + "factcheck.md", + "verafiles.org", + "rappler.com/fact-check", + "vera.com.gt", + "chequeado.com", + "aosfatos.org", + "lasillavacia.com/detector-mentiras", + "colombiacheck.com", + "ecuadorchequea.com", + "elsurti.com/checado", + "verificat.cat", + "mafindo.or.id", + "tempo.co/cek-fakta", + "factcheck.mk", + "raskrinkavanje.ba", + "faktograf.hr", + "demagog.cz", + "faktabaari.fi", + "correctiv.org", + "mimikama.at", + "factcheck.vlaanderen", + "factuel.afp.com", + "nieuwscheckers.nl", + "faktisk.no", + "tjekdet.dk", + "ellinikahoaxes.gr", + "faktograf.id", + "stopfake.kz", + "pesacheck.org", + "dubawa.org", + "namibiafactcheck.org.na", + "zimfact.org", + "ghanafact.com", + "factspace.africa", + "factcrescendo.com", + "vishvasnews.com", + "factcheck.lk", + "newschecker.in", + "boomlive.in", + "digiteye.in", + "indiatoday.in/fact-check", + "factcrescendo.com", + "piyasa.com/fact-check", + "taiwanese.facts.news", + "taiwanfactcheck.com", + "mygopen.com", + "tfc-taiwan.org.tw", + "cofacts.tw", + "rumor.taipei", + "fact.qq.com", + "factcheck.afp.com/list", + "acfta.org", + "crosscheck.firstdraftnews.org", + "healthfeedback.org", + "climatefeedback.org", + "sciencefeedback.co", + "factcheck.aap.com.au", + "emergent.info", + "hoax-slayer.net", + "truthorfiction.com", + "factcheck.media", + "mediawise.org", + "thejournal.ie/factcheck", + "journalistsresource.org", + "metafact.io", + "reporterslab.org/fact-checking", + ] ], "news_sites": [ FactCheckSource(domain=domain, type=SourceType.NEWS_SITE, priority=2) @@ -133,16 +138,14 @@ SOURCES = { "www.risingbd.com/english", "www.dailyindustry.news", "www.bangladeshpost.net", - "www.daily-bangladesh.com/english" + "www.daily-bangladesh.com/english", ] - ] + ], } + async def fetch_fact_checks( - api_key: str, - base_url: str, - query: str, - site: FactCheckSource + api_key: str, base_url: str, query: str, site: FactCheckSource ) -> Dict: """ Fetch fact checks from a specific site using the Google Fact Check API @@ -156,9 +159,9 @@ async def fetch_fact_checks( "query": query, "languageCode": "en-US", "reviewPublisherSiteFilter": site.domain, - "pageSize": 10 + "pageSize": 10, } - + response = requests.get(base_url, params=params) response.raise_for_status() return response.json() @@ -168,23 +171,22 @@ async def fetch_fact_checks( detail=ErrorResponse( detail=f"Error fetching from {site.domain}: {str(e)}", error_code="FACT_CHECK_SERVICE_ERROR", - path="/check-facts" - ).dict() + path="/check-facts", + ).dict(), ) except ValueError as e: raise HTTPException( status_code=500, detail=ErrorResponse( - detail=str(e), - error_code="CONFIGURATION_ERROR", - path="/check-facts" - ).dict() + detail=str(e), error_code="CONFIGURATION_ERROR", path="/check-facts" + ).dict(), ) + def get_all_sources() -> List[FactCheckSource]: """ Get all sources sorted by priority """ # all_sources = SOURCES["fact_checkers"] + SOURCES["news_sites"] - all_sources = SOURCES["fact_checkers"] - return sorted(all_sources, key=lambda x: x.priority) \ No newline at end of file + all_sources = SOURCES["fact_checkers"] + return sorted(all_sources, key=lambda x: x.priority) diff --git a/main.py b/main.py index 25d68c4..7048f3b 100644 --- a/main.py +++ b/main.py @@ -7,9 +7,7 @@ from app.config import FRONTEND_URL # Initialize FastAPI app app = FastAPI( - title="Your API Title", - description="Your API Description", - version="1.0.0" + title="Your API Title", description="Your API Description", version="1.0.0" ) # CORS configuration @@ -30,16 +28,19 @@ app.add_middleware( allow_headers=["*"], ) + # Basic root endpoint @app.get("/") async def root(): return {"message": "Welcome to your FastAPI application"} + # Health check endpoint @app.get("/health") async def health_check(): return {"status": "healthy"} + app.include_router(fact_check_router, prefix="") app.include_router(aifact_check_router, prefix="") app.include_router(scrap_websites_router, prefix="") @@ -50,4 +51,5 @@ app.include_router(scrap_websites_router, prefix="") if __name__ == "__main__": import uvicorn - uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True) \ No newline at end of file + + uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True) diff --git a/tests/test_main.py b/tests/test_main.py index 2298bfc..e71e19a 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -3,16 +3,19 @@ from main import app client = TestClient(app) + def test_root_endpoint(): response = client.get("/") assert response.status_code == 200 assert response.json() == {"message": "Welcome to your FastAPI application"} + def test_health_endpoint(): response = client.get("/health") assert response.status_code == 200 assert response.json() == {"status": "healthy"} + def test_cors_headers(): response = client.get("/", headers={"Origin": "http://localhost:5173"}) - assert response.headers["access-control-allow-origin"] == "http://localhost:5173" \ No newline at end of file + assert response.headers["access-control-allow-origin"] == "http://localhost:5173"