diff --git a/.gitignore b/.gitignore index 21d6e87..cd4609c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ env .env test.py -/__pycache__/ \ No newline at end of file +__pycache__ \ No newline at end of file diff --git a/app/__pycache__/config.cpython-312.pyc b/app/__pycache__/config.cpython-312.pyc index 53d89a6..b086fe1 100644 Binary files a/app/__pycache__/config.cpython-312.pyc and b/app/__pycache__/config.cpython-312.pyc differ diff --git a/app/api/__pycache__/fact_check.cpython-312.pyc b/app/api/__pycache__/fact_check.cpython-312.pyc index daa45a8..b5709d2 100644 Binary files a/app/api/__pycache__/fact_check.cpython-312.pyc and b/app/api/__pycache__/fact_check.cpython-312.pyc differ diff --git a/app/api/ai_fact_check.py b/app/api/ai_fact_check.py new file mode 100644 index 0000000..6d1f2d7 --- /dev/null +++ b/app/api/ai_fact_check.py @@ -0,0 +1,110 @@ +from fastapi import APIRouter, HTTPException +from app.services.openai_client import OpenAIClient, AIFactChecker +from app.config import OPENAI_API_KEY +from app.models.ai_fact_check_models import ( + AIFactCheckRequest, + AIFactCheckResponse, + VerificationResult, + TokenUsage, + ErrorResponse +) +from urllib.parse import urlparse +import asyncio + +# Initialize router and OpenAI client +aifact_check_router = APIRouter() +openai_client = OpenAIClient(api_key=OPENAI_API_KEY) +fact_checker = AIFactChecker(openai_client=openai_client) + +@aifact_check_router.post( + "/aicheck-facts", + response_model=AIFactCheckResponse, + responses={ + 400: {"model": ErrorResponse}, + 500: {"model": ErrorResponse} + } +) +async def ai_fact_check(request: AIFactCheckRequest): + """ + Endpoint to fact-check a given statement based on multiple webpage URLs. 
+ Input: + - urls: List of webpage URLs to analyze (with or without http/https) + - content: The fact statement to verify + Response: + - JSON response with verification results per URL, sources, and token usage + """ + try: + results = {} + all_sources = set() + all_contexts = [] + total_prompt_tokens = 0 + total_completion_tokens = 0 + total_tokens = 0 + + # Process all URLs concurrently + tasks = [ + fact_checker.check_fact(url=url, query=request.content) + for url in request.urls + ] + fact_check_results = await asyncio.gather(*tasks, return_exceptions=True) + + # Process results + for url, result in zip(request.urls, fact_check_results): + if isinstance(result, Exception): + # Handle failed URL checks + results[url] = VerificationResult( + verdict="Error", + confidence="Low", + evidence=f"Error checking URL: {str(result)}", + reasoning="URL processing failed", + missing_info="Could not access or process the URL" + ) + continue + + verification_result = VerificationResult( + verdict=result["verification_result"]["verdict"], + confidence=result["verification_result"]["confidence"], + evidence=result["verification_result"]["evidence"], + reasoning=result["verification_result"]["reasoning"], + missing_info=result["verification_result"].get("missing_info", None) + ) + + results[url] = verification_result + all_sources.update(result["sources"]) + + # Accumulate token usage + total_prompt_tokens += result["token_usage"]["prompt_tokens"] + total_completion_tokens += result["token_usage"]["completion_tokens"] + total_tokens += result["token_usage"]["total_tokens"] + + token_usage = TokenUsage( + prompt_tokens=total_prompt_tokens, + completion_tokens=total_completion_tokens, + total_tokens=total_tokens + ) + + return AIFactCheckResponse( + query=request.content, + verification_result=results, + sources=list(all_sources), + token_usage=token_usage + ) + + except ValueError as e: + raise HTTPException( + status_code=400, + detail=ErrorResponse( + detail=str(e), + 
error_code="INVALID_URL", + path="/aicheck-facts" + ).dict() + ) + except Exception as e: + raise HTTPException( + status_code=500, + detail=ErrorResponse( + detail=f"Error processing fact-check request: {str(e)}", + error_code="PROCESSING_ERROR", + path="/aicheck-facts" + ).dict() + ) \ No newline at end of file diff --git a/app/api/fact_check.py b/app/api/fact_check.py index 3e7a12d..b52ef24 100644 --- a/app/api/fact_check.py +++ b/app/api/fact_check.py @@ -1,291 +1,192 @@ from fastapi import APIRouter, HTTPException -from pydantic import BaseModel, Field, HttpUrl, validator, ConfigDict -from typing import Dict, List, Optional, Union -import requests -from enum import Enum -from datetime import datetime -import json -from app.config import GOOGLE_FACT_CHECK_API_KEY, GOOGLE_FACT_CHECK_BASE_URL +import httpx +from app.config import GOOGLE_API_KEY, GOOGLE_FACT_CHECK_BASE_URL, OPENAI_API_KEY +from app.api.scrap_websites import search_websites, SearchRequest +from app.services.openai_client import OpenAIClient +from app.models.fact_check_models import ( + FactCheckRequest, + FactCheckResponse, + ErrorResponse, + Source +) +from app.websites.fact_checker_website import get_all_sources fact_check_router = APIRouter() +openai_client = OpenAIClient(OPENAI_API_KEY) -class CustomJSONEncoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj, datetime): - return obj.isoformat() - return super().default(obj) +async def generate_fact_report(query: str, fact_check_data: dict) -> FactCheckResponse: + """Generate a fact check report using OpenAI based on the fact check results.""" + try: + base_system_prompt = """You are a professional fact-checking reporter. Your task is to create a detailed fact check report based on the provided data. Focus on accuracy, clarity, and proper citation of sources. 
-class ErrorResponse(BaseModel): - detail: str - error_code: str = Field(..., description="Unique error code for this type of error") - timestamp: str = Field(default_factory=lambda: datetime.now().isoformat()) - path: Optional[str] = Field(None, description="The endpoint path where error occurred") +Rules: +1. Include all source URLs and names in the sources list +2. Keep the explanation focused on verifiable facts +3. Include dates when available +4. Maintain objectivity in the report""" - model_config = ConfigDict(json_schema_extra={ - "example": { - "detail": "Error description", - "error_code": "ERROR_CODE", - "timestamp": "2024-12-09T16:49:30.905765", - "path": "/check-facts" + base_user_prompt = """Generate a comprehensive fact check report in this exact JSON format: +{ + "claim": "Write the exact claim being verified", + "verdict": "One of: True/False/Partially True/Unverified", + "confidence": "One of: High/Medium/Low", + "sources": [ + { + "url": "Full URL of the source", + "name": "Name of the source organization" } - }) - -class RequestValidationError(BaseModel): - loc: List[str] - msg: str - type: str - -class Publisher(BaseModel): - name: str - site: Optional[str] = Field(None, description="Publisher's website") - - @validator('site') - def validate_site(cls, v): - if v and not (v.startswith('http://') or v.startswith('https://')): - return f"https://{v}" - return v - -class ClaimReview(BaseModel): - publisher: Publisher - url: Optional[HttpUrl] = None - title: Optional[str] = None - reviewDate: Optional[str] = None - textualRating: Optional[str] = None - languageCode: str = Field(default="en-US") - -class Claim(BaseModel): - text: str - claimant: Optional[str] = None - claimDate: Optional[str] = None - claimReview: List[ClaimReview] - -class FactCheckResponse(BaseModel): - query: str = Field(..., description="Original query that was fact-checked") - total_claims_found: int = Field(..., ge=0) - results: List[Claim] = Field(default_factory=list) - 
summary: Dict[str, int] = Field(...) - - model_config = ConfigDict(json_schema_extra={ - "example": { - "query": "Example claim", - "total_claims_found": 1, - "results": [{ - "text": "Example claim text", - "claimant": "Source name", - "claimReview": [{ - "publisher": { - "name": "Fact Checker", - "site": "factchecker.com" - }, - "textualRating": "True" - }] - }], - "summary": { - "total_sources": 1, - "fact_checking_sites_queried": 10 - } - } - }) - -class SourceType(str, Enum): - FACT_CHECKER = "fact_checker" - NEWS_SITE = "news_site" - -class FactCheckSource(BaseModel): - domain: str - type: SourceType - priority: int = Field(default=1, ge=1, le=10) - - model_config = ConfigDict(json_schema_extra={ - "example": { - "domain": "factcheck.org", - "type": "fact_checker", - "priority": 1 - } - }) - -# Sources configuration with validation -SOURCES = { - "fact_checkers": [ - FactCheckSource(domain=domain, type=SourceType.FACT_CHECKER, priority=1) - for domain in [ - "factcheck.org", - "snopes.com", - "politifact.com", - "reuters.com", - "bbc.com", - "apnews.com", - "usatoday.com", - "nytimes.com", - "washingtonpost.com", - "afp.com", - "fullfact.org", - "truthorfiction.com", - "leadstories.com", - "altnews.in", - "boomlive.in", - "en.prothomalo.com" - ] ], - "news_sites": [ - FactCheckSource(domain=domain, type=SourceType.NEWS_SITE, priority=2) - for domain in [ - "www.thedailystar.net", - "www.thefinancialexpress.com.bd", - "www.theindependentbd.com", - "www.dhakatribune.com", - "www.newagebd.net", - "www.observerbd.com", - "www.daily-sun.com", - "www.tbsnews.net", - "www.businesspostbd.com", - "www.banglanews24.com/english", - "www.bdnews24.com/english", - "www.risingbd.com/english", - "www.dailyindustry.news", - "www.bangladeshpost.net", - "www.daily-bangladesh.com/english" - ] - ] + "evidence": "A concise summary of the key evidence (1-2 sentences)", + "explanation": "A detailed explanation including who verified it, when it was verified, and the key findings (2-3 
sentences)", + "additional_context": "Important context about the verification process, limitations, or broader implications (1-2 sentences)" } -class FactCheckRequest(BaseModel): - content: str = Field( - ..., - min_length=10, - max_length=1000, - description="The claim to be fact-checked" - ) - language: str = Field(default="en-US", pattern="^[a-z]{2}-[A-Z]{2}$") - max_results_per_source: int = Field(default=10, ge=1, le=50) +Ensure all URLs in sources are complete (including https:// if missing) and each source has both a URL and name.""" - @validator('content') - def validate_content(cls, v): - if not v.strip(): - raise ValueError("Content cannot be empty or just whitespace") - return v.strip() + if "claims" in fact_check_data: + system_prompt = base_system_prompt + user_prompt = f"""Query: {query} + Fact Check Results: {fact_check_data} + + {base_user_prompt} -async def fetch_fact_checks( - api_key: str, - base_url: str, - query: str, - site: FactCheckSource -) -> Dict: - """ - Fetch fact checks from a specific site using the Google Fact Check API - """ - try: - if not api_key or not base_url: - raise ValueError("API key or base URL not configured") + The report should: + 1. Include ALL source URLs and organization names + 2. Specify verification dates when available + 3. Name the fact-checking organizations involved + 4. Describe the verification process""" + + else: + system_prompt = base_system_prompt + user_prompt = f"""Query: {query} + Fact Check Results: {fact_check_data} + + {base_user_prompt} - params = { - "key": api_key, - "query": query, - "languageCode": "en-US", - "reviewPublisherSiteFilter": site.domain, - "pageSize": 10 - } + The report should: + 1. Include ALL source URLs and names from both verification_result and sources fields + 2. Mention all fact-checking organizations involved + 3. Describe the verification process + 4. 
Note any conflicting information between sources""" + + response = await openai_client.generate_text_response( + system_prompt=system_prompt, + user_prompt=user_prompt, + max_tokens=1000 + ) - response = requests.get(base_url, params=params) - response.raise_for_status() - return response.json() - except requests.RequestException as e: - raise HTTPException( - status_code=503, - detail=ErrorResponse( - detail=f"Error fetching from {site.domain}: {str(e)}", - error_code="FACT_CHECK_SERVICE_ERROR", - path="/check-facts" - ).dict() - ) - except ValueError as e: + try: + # First try to parse the response directly + response_data = response["response"] + + # Clean up sources before validation + if isinstance(response_data.get('sources'), list): + cleaned_sources = [] + for source in response_data['sources']: + if isinstance(source, str): + # Convert string sources to Source objects + url = source if source.startswith('http') else f"https://{source}" + cleaned_sources.append({ + "url": url, + "name": source + }) + elif isinstance(source, dict): + # Ensure URL has proper scheme + url = source.get('url', '') + if url and not url.startswith('http'): + source['url'] = f"https://{url}" + cleaned_sources.append(source) + response_data['sources'] = cleaned_sources + + fact_check_response = FactCheckResponse(**response_data) + return fact_check_response + + except Exception as validation_error: + print(f"Response validation error: {str(validation_error)}") + raise HTTPException( + status_code=422, + detail=ErrorResponse( + detail=f"Invalid response format: {str(validation_error)}", + error_code="VALIDATION_ERROR", + path="/check-facts" + ).dict() + ) + + except Exception as e: + print(f"Error generating fact report: {str(e)}") raise HTTPException( status_code=500, detail=ErrorResponse( - detail=str(e), + detail="Error generating fact report", + error_code="FACT_CHECK_ERROR", + path="/check-facts" + ).dict() + ) + +@fact_check_router.post("/check-facts", 
response_model=FactCheckResponse) +async def check_facts(request: FactCheckRequest): + """ + Fetch fact check results and generate a comprehensive report. + """ + if not GOOGLE_API_KEY or not GOOGLE_FACT_CHECK_BASE_URL: + raise HTTPException( + status_code=500, + detail=ErrorResponse( + detail="Google API key or base URL is not configured", error_code="CONFIGURATION_ERROR", path="/check-facts" ).dict() ) -@fact_check_router.post( - "/check-facts", - response_model=FactCheckResponse, - responses={ - 400: {"model": ErrorResponse}, - 404: {"model": ErrorResponse}, - 500: {"model": ErrorResponse}, - 503: {"model": ErrorResponse} - } -) -async def check_facts(request: FactCheckRequest) -> FactCheckResponse: - """ - Check facts using multiple fact-checking sources - """ - all_results = [] - - # Validate configuration - if not GOOGLE_FACT_CHECK_API_KEY or not GOOGLE_FACT_CHECK_BASE_URL: - raise HTTPException( - status_code=500, - detail=ErrorResponse( - detail="API configuration is missing", - error_code="CONFIGURATION_ERROR", - path="/check-facts" - ).dict() - ) - - # Check all sources in priority order - all_sources = ( - SOURCES["fact_checkers"] + - SOURCES["news_sites"] - ) - all_sources.sort(key=lambda x: x.priority) - - for source in all_sources: + headers = {"Content-Type": "application/json"} + async with httpx.AsyncClient() as client: + # Get fact checker sources from the centralized configuration + fact_checker_sources = get_all_sources() + + for source in fact_checker_sources: + params = { + "key": GOOGLE_API_KEY, + "query": request.query, + "languageCode": "en-US", + "reviewPublisherSiteFilter": source.domain, + "pageSize": 10 + } + + try: + response = await client.get( + GOOGLE_FACT_CHECK_BASE_URL, + params=params, + headers=headers + ) + response.raise_for_status() + json_response = response.json() + + if json_response.get("claims"): + return await generate_fact_report(request.query, json_response) + + except httpx.RequestError as e: + print(f"Error fetching 
results for site {source.domain}: {str(e)}") + continue + except Exception as e: + print(f"Unexpected error for site {source.domain}: {str(e)}") + continue + try: - result = await fetch_fact_checks( - GOOGLE_FACT_CHECK_API_KEY, - GOOGLE_FACT_CHECK_BASE_URL, - request.content, - source + search_request = SearchRequest( + search_text=request.query, + source_types=["fact_checkers"] ) - if "claims" in result: - # Validate each claim through Pydantic - validated_claims = [ - Claim(**claim).dict() - for claim in result["claims"] - ] - all_results.extend(validated_claims) - - except HTTPException: - raise + ai_response = await search_websites(search_request) + return await generate_fact_report(request.query, ai_response) + except Exception as e: - # Log the error but continue with other sources - print(f"Error processing {source.domain}: {str(e)}") - continue - - if not all_results: - raise HTTPException( - status_code=404, - detail=ErrorResponse( - detail="No fact check results found", - error_code="NO_RESULTS_FOUND", - path="/check-facts" - ).dict() - ) - - # Create the response using Pydantic model - response = FactCheckResponse( - query=request.content, - total_claims_found=len(all_results), - results=all_results, - summary={ - "total_sources": len(set(claim.get("claimReview", [{}])[0].get("publisher", {}).get("site", "") - for claim in all_results if claim.get("claimReview"))), - "fact_checking_sites_queried": len(all_sources) - } - ) - - return response \ No newline at end of file + print(f"Error in AI fact check: {str(e)}") + raise HTTPException( + status_code=404, + detail=ErrorResponse( + detail="No fact check results found", + error_code="NOT_FOUND", + path="/check-facts" + ).dict() + ) \ No newline at end of file diff --git a/app/api/scrap_websites.py b/app/api/scrap_websites.py new file mode 100644 index 0000000..946ec01 --- /dev/null +++ b/app/api/scrap_websites.py @@ -0,0 +1,160 @@ +from fastapi import APIRouter, HTTPException +import httpx +import logging 
+from urllib.parse import urlparse +from typing import List, Dict, Optional +from pydantic import BaseModel +from app.models.ai_fact_check_models import ( + AIFactCheckRequest, + FactCheckSource, + SourceType +) +from app.websites.fact_checker_website import SOURCES, get_all_sources +from app.api.ai_fact_check import ai_fact_check +from app.config import GOOGLE_API_KEY, GOOGLE_ENGINE_ID, GOOGLE_SEARCH_URL + + +class SearchRequest(BaseModel): + search_text: str + source_types: List[str] = ["fact_checkers"] + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + +scrap_websites_router = APIRouter() + +# Constants +RESULTS_PER_PAGE = 10 +MAX_PAGES = 5 +MAX_URLS_PER_DOMAIN = 5 + + +def get_domain_from_url(url: str) -> str: + """Extract domain from URL with improved handling.""" + try: + parsed = urlparse(url) + domain = parsed.netloc.lower() + if domain.startswith('www.'): + domain = domain[4:] + return domain + except Exception as e: + logger.error(f"Error extracting domain from URL {url}: {str(e)}") + return "" + +def is_valid_source_domain(domain: str, sources: List[FactCheckSource]) -> bool: + """Check if domain matches any source with improved matching logic.""" + if not domain: + return False + + domain = domain.lower() + if domain.startswith('www.'): + domain = domain[4:] + + for source in sources: + source_domain = source.domain.lower() + if source_domain.startswith('www.'): + source_domain = source_domain[4:] + + if domain == source_domain or domain.endswith('.' 
+ source_domain): + return True + + return False + +async def build_enhanced_search_query(query: str, sources: List[FactCheckSource]) -> str: + """Build search query with site restrictions.""" + site_queries = [f"site:{source.domain}" for source in sources] + site_restriction = " OR ".join(site_queries) + return f"({query}) ({site_restriction})" + +async def google_custom_search(query: str, sources: List[FactCheckSource], page: int = 1) -> Optional[Dict]: + """Perform Google Custom Search with enhanced query.""" + enhanced_query = await build_enhanced_search_query(query, sources) + start_index = ((page - 1) * RESULTS_PER_PAGE) + 1 + + params = { + "key": GOOGLE_API_KEY, + "cx": GOOGLE_ENGINE_ID, + "q": enhanced_query, + "num": RESULTS_PER_PAGE, + "start": start_index + } + + async with httpx.AsyncClient(timeout=30.0) as client: + try: + response = await client.get(GOOGLE_SEARCH_URL, params=params) + response.raise_for_status() + return response.json() + except Exception as e: + logger.error(f"Search error: {str(e)}") + raise HTTPException(status_code=500, detail=f"Search error: {str(e)}") + +@scrap_websites_router.post("/search") +async def search_websites(request: SearchRequest): + # Get the source types from the request + source_types = request.source_types if request.source_types else ["fact_checkers"] + + # Get sources based on requested types + selected_sources = [] + for source_type in source_types: + if source_type in SOURCES: + selected_sources.extend(SOURCES[source_type]) + + # If no valid sources found, use fact checkers as default + if not selected_sources: + selected_sources = SOURCES["fact_checkers"] + + all_urls = [] + domain_results = {} + + try: + for page in range(1, MAX_PAGES + 1): + if len(all_urls) >= 50: + break + + search_response = await google_custom_search(request.search_text, selected_sources, page) + + if not search_response or not search_response.get("items"): + break + + for item in search_response.get("items", []): + url = 
item.get("link") + if not url: + continue + + domain = get_domain_from_url(url) + + if is_valid_source_domain(domain, selected_sources): + if domain not in domain_results: + domain_results[domain] = [] + + if len(domain_results[domain]) < MAX_URLS_PER_DOMAIN: + domain_results[domain].append({ + "url": url, + "title": item.get("title", ""), + "snippet": item.get("snippet", "") + }) + all_urls.append(url) + + if len(all_urls) >= 50: + break + + if not all_urls: + return { + "status": "no_results", + "urls_found": 0 + } + + fact_check_request = AIFactCheckRequest( + content=request.search_text, + urls=all_urls[:5] + ) + + return await ai_fact_check(fact_check_request) + + except Exception as e: + logger.error(f"Error during search/fact-check process: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) \ No newline at end of file diff --git a/app/config.py b/app/config.py index d9de9e9..b890247 100644 --- a/app/config.py +++ b/app/config.py @@ -3,8 +3,10 @@ from dotenv import load_dotenv load_dotenv() -GOOGLE_FACT_CHECK_API_KEY = os.environ["GOOGLE_FACT_CHECK_API_KEY"] +GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"] GOOGLE_FACT_CHECK_BASE_URL= os.environ["GOOGLE_FACT_CHECK_BASE_URL"] +GOOGLE_ENGINE_ID = os.environ["GOOGLE_ENGINE_ID"] +GOOGLE_SEARCH_URL = os.environ["GOOGLE_SEARCH_URL"] OPENAI_API_KEY = os.environ["OPENAI_API_KEY"] FRONTEND_URL = os.environ["FRONTEND_URL"] \ No newline at end of file diff --git a/app/models/__pycache__/fact_check_models.cpython-312.pyc b/app/models/__pycache__/fact_check_models.cpython-312.pyc new file mode 100644 index 0000000..7cb8e9a Binary files /dev/null and b/app/models/__pycache__/fact_check_models.cpython-312.pyc differ diff --git a/app/models/ai_fact_check_models.py b/app/models/ai_fact_check_models.py new file mode 100644 index 0000000..0949e51 --- /dev/null +++ b/app/models/ai_fact_check_models.py @@ -0,0 +1,229 @@ +from pydantic import BaseModel, Field, HttpUrl, validator, ConfigDict +from typing import Dict, 
List, Optional, Any, Union +from enum import Enum +from datetime import datetime +from urllib.parse import urlparse + +# Common Models +class TokenUsage(BaseModel): + prompt_tokens: Optional[int] = 0 + completion_tokens: Optional[int] = 0 + total_tokens: Optional[int] = 0 + +class ErrorResponse(BaseModel): + detail: str + error_code: str = Field(..., description="Unique error code for this type of error") + timestamp: str = Field(default_factory=lambda: datetime.now().isoformat()) + path: Optional[str] = Field(None, description="The endpoint path where error occurred") + + model_config = ConfigDict(json_schema_extra={ + "example": { + "detail": "Error description", + "error_code": "ERROR_CODE", + "timestamp": "2024-12-09T16:49:30.905765", + "path": "/check-facts" + } + }) + +# Fact Check Models +class Publisher(BaseModel): + name: str + site: Optional[str] = Field(None, description="Publisher's website") + + @validator('site') + def validate_site(cls, v): + if v and not (v.startswith('http://') or v.startswith('https://')): + return f"https://{v}" + return v + +class ClaimReview(BaseModel): + publisher: Publisher + url: Optional[HttpUrl] = None + title: Optional[str] = None + reviewDate: Optional[str] = None + textualRating: Optional[str] = None + languageCode: str = Field(default="en-US") + +class Claim(BaseModel): + text: str + claimant: Optional[str] = None + claimDate: Optional[str] = None + claimReview: List[ClaimReview] + +class SourceType(str, Enum): + FACT_CHECKER = "fact_checker" + NEWS_SITE = "news_site" + +class FactCheckSource(BaseModel): + domain: str + type: SourceType + priority: int = Field(default=1, ge=1, le=10) + +# Verification Models +class VerificationResult(BaseModel): + verdict: str = Field(..., description="True/False/Insufficient Information") + confidence: str = Field(..., description="High/Medium/Low") + evidence: Union[str, List[str]] + reasoning: str + missing_info: Optional[str] = None + + model_config = ConfigDict(json_schema_extra={ 
+ "example": { + "verdict": "True", + "confidence": "High", + "evidence": ["Direct quote from source supporting the claim"], + "reasoning": "Detailed analysis of why the claim is considered true", + "missing_info": "Any caveats or limitations of the verification" + } + }) + +# Request Models +class BaseFactCheckRequest(BaseModel): + content: str = Field( + ..., + min_length=10, + max_length=1000, + description="The claim to be fact-checked" + ) + + @validator('content') + def validate_content(cls, v): + if not v.strip(): + raise ValueError("Content cannot be empty or just whitespace") + return v.strip() + +class GoogleFactCheckRequest(BaseFactCheckRequest): + language: str = Field(default="en-US", pattern="^[a-z]{2}-[A-Z]{2}$") + max_results_per_source: int = Field(default=10, ge=1, le=50) + +class AIFactCheckRequest(BaseFactCheckRequest): + urls: List[str] = Field( + ..., + min_items=1, + max_items=5, + description="List of URLs to check the content against. URLs will be prefixed with https:// if protocol is missing" + ) + + @validator('urls') + def validate_urls(cls, urls): + validated_urls = [] + for url in urls: + if not url.strip(): + raise ValueError("URL cannot be empty") + + # Add https:// if no protocol specified + if not url.startswith(('http://', 'https://')): + url = f'https://{url}' + + try: + result = urlparse(url) + if not result.netloc: + raise ValueError(f"Invalid URL structure for {url}") + validated_urls.append(url) + except Exception as e: + raise ValueError(f"Invalid URL {url}: {str(e)}") + + return validated_urls + + model_config = ConfigDict(json_schema_extra={ + "example": { + "content": "Indian flag was drawn in BUET campus", + "urls": [ + "www.altnews.in/article-about-flag", + "www.another-source.com/related-news" + ] + } + }) + +# Response Models +class BaseFactCheckResponse(BaseModel): + query: str + token_usage: TokenUsage + sources: List[str] + + model_config = ConfigDict(json_schema_extra={ + "example": { + "query": "Example statement 
to verify", + "token_usage": { + "prompt_tokens": 100, + "completion_tokens": 50, + "total_tokens": 150 + }, + "sources": ["source1.com", "source2.com"], + } + }) + +class GoogleFactCheckResponse(BaseFactCheckResponse): + total_claims_found: int + results: List[Dict[str, Any]] + verification_result: Dict[str, Any] + summary: Dict[str, int] + + model_config = ConfigDict(json_schema_extra={ + "example": { + "query": "Example claim", + "total_claims_found": 1, + "results": [{ + "text": "Example claim text", + "claimant": "Source name", + "claimReview": [{ + "publisher": { + "name": "Fact Checker", + "site": "factchecker.com" + }, + "textualRating": "True" + }] + }], + "verification_result": { + "verdict": "True", + "confidence": "High", + "evidence": ["Supporting evidence"], + "reasoning": "Detailed analysis" + }, + "sources": ["factchecker.com"], + "token_usage": { + "prompt_tokens": 100, + "completion_tokens": 50, + "total_tokens": 150 + }, + "summary": { + "total_sources": 1, + "fact_checking_sites_queried": 10 + } + } + }) + +class AIFactCheckResponse(BaseFactCheckResponse): + verification_result: Dict[str, VerificationResult] # Changed to Dict to store results per URL + + model_config = ConfigDict(json_schema_extra={ + "example": { + "query": "Indian flag was drawn in BUET campus", + "verification_result": { + "https://www.source1.com": { + "verdict": "True", + "confidence": "High", + "evidence": ["Supporting evidence from source 1"], + "reasoning": "Detailed analysis from source 1", + "missing_info": None + }, + "https://www.source2.com": { + "verdict": "True", + "confidence": "Medium", + "evidence": ["Supporting evidence from source 2"], + "reasoning": "Analysis from source 2", + "missing_info": "Additional context needed" + } + }, + "sources": ["source1.com", "source2.com"], + "token_usage": { + "prompt_tokens": 200, + "completion_tokens": 100, + "total_tokens": 300 + } + } + }) + +# Backwards compatibility aliases +FactCheckRequest = GoogleFactCheckRequest 
+FactCheckResponse = GoogleFactCheckResponse \ No newline at end of file diff --git a/app/models/fact_check_models.py b/app/models/fact_check_models.py new file mode 100644 index 0000000..1b30511 --- /dev/null +++ b/app/models/fact_check_models.py @@ -0,0 +1,101 @@ +from pydantic import BaseModel, Field, HttpUrl, validator +from typing import List, Literal, Union +from datetime import datetime +from enum import Enum + +class VerdictEnum(str, Enum): + TRUE = "True" + FALSE = "False" + PARTIALLY_TRUE = "Partially True" + UNVERIFIED = "Unverified" + +class ConfidenceEnum(str, Enum): + HIGH = "High" + MEDIUM = "Medium" + LOW = "Low" + +class FactCheckRequest(BaseModel): + query: str = Field( + ..., + min_length=3, + max_length=500, + description="The claim or statement to be fact-checked", + example="Did NASA confirm finding alien structures on Mars in 2024?" + ) + +class Source(BaseModel): + url: str + name: str = "" + + @validator('url') + def validate_url(cls, v): + # Basic URL validation without requiring HTTP/HTTPS + if not v or len(v) < 3: + raise ValueError("URL must not be empty and must be at least 3 characters") + return v + +class FactCheckResponse(BaseModel): + claim: str = Field( + ..., + min_length=10, + max_length=1000, + description="The exact claim being verified" + ) + verdict: VerdictEnum = Field( + ..., + description="The verification verdict" + ) + confidence: ConfidenceEnum = Field( + ..., + description="Confidence level in the verdict" + ) + sources: List[Source] = Field( + ..., + min_items=1, + description="List of sources used in verification" + ) + evidence: str = Field( + ..., + min_length=20, + max_length=500, + description="Concise summary of key evidence" + ) + explanation: str = Field( + ..., + min_length=50, + max_length=1000, + description="Detailed explanation of verification findings" + ) + additional_context: str = Field( + ..., + min_length=20, + max_length=500, + description="Important context about the verification" + ) + + class 
Config: + json_schema_extra = { + "example": { + "claim": "NASA confirmed finding alien structures on Mars in 2024", + "verdict": "False", + "confidence": "High", + "sources": [ + { + "url": "https://www.nasa.gov/mars-exploration", + "name": "NASA Mars Exploration" + }, + { + "url": "https://factcheck.org/2024/mars-claims", + "name": "FactCheck.org" + } + ], + "evidence": "NASA has made no such announcement. Recent Mars rover images show natural rock formations.", + "explanation": "Multiple fact-checking organizations investigated this claim. NASA's official communications and Mars mission reports from 2024 contain no mention of alien structures. The viral images being shared are misidentified natural geological formations.", + "additional_context": "Similar false claims about alien structures on Mars have circulated periodically since the first Mars rovers began sending back images." + } + } + +class ErrorResponse(BaseModel): + detail: str + error_code: str = Field(..., example="VALIDATION_ERROR") + path: str = Field(..., example="/check-facts") \ No newline at end of file diff --git a/app/models/scrap_websites_models.py b/app/models/scrap_websites_models.py new file mode 100644 index 0000000..1c629c5 --- /dev/null +++ b/app/models/scrap_websites_models.py @@ -0,0 +1,43 @@ +from pydantic import BaseModel +from typing import List, Dict + +class SearchRequest(BaseModel): + search_text: str + source_types: List[str] = ["fact_checkers"] + +class Publisher(BaseModel): + name: str + site: str + +class ClaimReview(BaseModel): + publisher: Publisher + textualRating: str + +class Claim(BaseModel): + claimReview: List[ClaimReview] + claimant: str + text: str + +class Summary(BaseModel): + fact_checking_sites_queried: int + total_sources: int + +class TokenUsage(BaseModel): + prompt_tokens: int + completion_tokens: int + total_tokens: int + +class VerificationResult(BaseModel): + verdict: str + confidence: str + evidence: List[str] + reasoning: str + +class 
class EnhancedFactCheckResponse(BaseModel):
    """Aggregate response combining fact-check claims with AI verification."""
    query: str
    results: List[Claim]
    sources: List[str]
    summary: Summary
    token_usage: Dict[str, int]
    total_claims_found: int
    verification_result: VerificationResult


from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from typing import List, Dict, Any
import numpy as np
import logging
import openai
import json

# FIX: the original did `import logging as logger`, which aliased the module
# itself and sent every record through the unconfigured root logger. A proper
# per-module logger keeps records attributable and configurable.
logger = logging.getLogger(__name__)


class OpenAIClient:
    def __init__(self, api_key: str):
        """Configure the OpenAI API key.

        NOTE(review): this assigns the module-global ``openai.api_key``, so
        the most recently constructed client wins process-wide.
        """
        openai.api_key = api_key

    async def generate_text_response(self, system_prompt: str, user_prompt: str, max_tokens: int) -> dict:
        """Run a GPT-4 chat completion and parse the model output as JSON.

        Returns a dict with the parsed ``response`` plus prompt/completion/total
        token counts. Raises ``Exception`` on API failure or when the model
        output is not valid JSON.
        """
        try:
            # FIX: use the async variant and await it. The original called the
            # synchronous ChatCompletion.create inside an async def, blocking
            # the event loop (and every other request) for the full round trip.
            response = await openai.ChatCompletion.acreate(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                max_tokens=max_tokens
            )
            content = response['choices'][0]['message']['content']
            # Parse the JSON string into a dictionary
            parsed_content = json.loads(content)

            return {
                "response": parsed_content,  # dictionary, not a raw string
                "prompt_tokens": response['usage']['prompt_tokens'],
                "completion_tokens": response['usage']['completion_tokens'],
                "total_tokens": response['usage']['total_tokens']
            }
        except json.JSONDecodeError as e:
            raise Exception(f"Failed to parse OpenAI response as JSON: {str(e)}")
        except Exception as e:
            raise Exception(f"OpenAI text generation error: {str(e)}")

    def get_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Retrieve ada-002 embeddings for a batch of texts.

        NOTE(review): this is a blocking call; callers on the async path will
        stall the event loop while it runs — consider offloading to a thread.
        """
        try:
            response = openai.Embedding.create(
                input=texts,
                model="text-embedding-ada-002"
            )
            embeddings = [data['embedding'] for data in response['data']]
            return embeddings
        except Exception as e:
            raise Exception(f"OpenAI embedding error: {str(e)}")


class AIFactChecker:
    def __init__(self, openai_client: OpenAIClient):
        """Initialize the fact checker with an OpenAI client and a splitter
        tuned for ~1000-char overlapping chunks of scraped page text."""
        self.openai_client = openai_client
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        )

    async def scrape_webpage(self, url: str) -> List[Document]:
        """Scrape one webpage, strip markup, and split it into chunks."""
        try:
            loader = AsyncHtmlLoader([url])
            docs = await loader.aload()

            bs_transformer = BeautifulSoupTransformer()
            docs_transformed = bs_transformer.transform_documents(docs)
            docs_chunks = self.text_splitter.split_documents(docs_transformed)

            logger.info(f"Successfully scraped webpage | chunks={len(docs_chunks)}")
            return docs_chunks

        except Exception as e:
            logger.error(f"Error scraping webpage | url={url} | error={str(e)}")
            raise

    def find_relevant_chunks(
        self,
        query_embedding: List[float],
        doc_embeddings: List[List[float]],
        docs: List[Document],
        top_k: int = 5
    ) -> List[Document]:
        """Return the ``top_k`` chunks most similar to the query (cosine),
        best first.

        ``top_k`` generalizes the previously hard-coded 5; the default keeps
        existing callers' behavior unchanged.
        """
        try:
            query_array = np.array(query_embedding)
            chunks_array = np.array(doc_embeddings)

            # Cosine similarity of every chunk against the query vector.
            similarities = np.dot(chunks_array, query_array) / (
                np.linalg.norm(chunks_array, axis=1) * np.linalg.norm(query_array)
            )

            # argsort ascending; take the last top_k and reverse for best-first.
            top_indices = np.argsort(similarities)[-top_k:][::-1]
            return [docs[i] for i in top_indices]

        except Exception as e:
            logger.error(f"Error finding relevant chunks | error={str(e)}")
            raise

    async def verify_fact(self, query: str, relevant_docs: List[Document]) -> Dict[str, Any]:
        """Ask the model to verify ``query`` against the given chunks.

        Returns the parsed verification dict, the deduplicated source URLs,
        and token usage.
        """
        try:
            context = "\n\n".join([doc.page_content for doc in relevant_docs])

            system_prompt = """You are a professional fact-checking assistant. Analyze the provided context
            and determine if the given statement is true, false, or if there isn't enough information.

            Provide your response in the following JSON format:
            {
                "verdict": "True/False/Insufficient Information",
                "confidence": "High/Medium/Low",
                "evidence": "Direct quotes or evidence from the context",
                "reasoning": "Your detailed analysis and reasoning",
                "missing_info": "Any important missing information (if applicable)"
            }"""

            user_prompt = f"""Context:
            {context}

            Statement to verify: "{query}"

            Analyze the statement based on the provided context and return your response in the specified JSON format."""

            response = await self.openai_client.generate_text_response(
                system_prompt=system_prompt,
                user_prompt=user_prompt,
                max_tokens=800
            )

            # Deduplicate source URLs drawn from chunk metadata.
            sources = list(set([doc.metadata.get('source', 'Unknown source') for doc in relevant_docs]))

            return {
                "verification_result": response["response"],  # parsed dict
                "sources": sources,
                "token_usage": {
                    "prompt_tokens": response["prompt_tokens"],
                    "completion_tokens": response["completion_tokens"],
                    "total_tokens": response["total_tokens"]
                }
            }

        except Exception as e:
            logger.error(f"Error verifying fact | error={str(e)}")
            raise

    async def check_fact(self, url: str, query: str) -> Dict[str, Any]:
        """End-to-end pipeline: scrape ``url``, embed, rank chunks against
        ``query``, and verify the statement with the model."""
        try:
            docs = await self.scrape_webpage(url)

            doc_texts = [doc.page_content for doc in docs]
            doc_embeddings = self.openai_client.get_embeddings(doc_texts)
            query_embedding = self.openai_client.get_embeddings([query])

            relevant_docs = self.find_relevant_chunks(query_embedding[0], doc_embeddings, docs)
            verification_result = await self.verify_fact(query, relevant_docs)

            return verification_result

        except Exception as e:
            logger.error(f"Error checking fact | error={str(e)}")
            raise
from typing import Dict, List
import requests
from fastapi import HTTPException
from app.models.ai_fact_check_models import FactCheckSource, ErrorResponse, FactCheckRequest, SourceType


def _unique(domains: List[str]) -> List[str]:
    """Drop duplicate domains while preserving first-seen order.

    The raw list below historically contained duplicates (faktograf.hr,
    truthorfiction.com, factcrescendo.com), which caused the same site to be
    queried twice per request.
    """
    return list(dict.fromkeys(domains))


# Sources configuration with validation; duplicates are removed at import time.
SOURCES = {
    "fact_checkers": [
        FactCheckSource(domain=domain, type=SourceType.FACT_CHECKER, priority=1)
        for domain in _unique([
            "snopes.com",
            "politifact.com",
            "factcheck.org",
            "reuters.com/fact-check",
            "apnews.com/hub/ap-fact-check",
            "bbc.com/news/reality_check",
            "fullfact.org",
            "afp.com/fact-check",
            "truthorfiction.com",
            "leadstories.com",
            "checkyourfact.com",
            "washingtonpost.com/news/fact-checker",
            "factcheck.kz",
            "poynter.org/ifcn",
            "factcheckeu.info",
            "africacheck.org",
            "thequint.com/webqoof",
            "altnews.in",
            "facta.news",
            "factcheckni.org",
            "mythdetector.ge",
            "verificado.mx",
            "euvsdisinfo.eu",
            "factcheck.afp.com",
            "newtral.es",
            "maldita.es",
            "faktograf.hr",
            "demagog.org.pl",
            "factnameh.com",
            "faktiskt.se",
            "teyit.org",
            "factly.in",
            "boom.live",
            "stopfake.org",
            "factcheck.ge",
            "factcheck.kg",
            "factcheck.uz",
            "factcheck.tj",
            "factcheck.az",
            "factcheck.am",
            "factcheck.md",
            "verafiles.org",
            "rappler.com/fact-check",
            "vera.com.gt",
            "chequeado.com",
            "aosfatos.org",
            "lasillavacia.com/detector-mentiras",
            "colombiacheck.com",
            "ecuadorchequea.com",
            "elsurti.com/checado",
            "verificat.cat",
            "mafindo.or.id",
            "tempo.co/cek-fakta",
            "factcheck.mk",
            "raskrinkavanje.ba",
            "demagog.cz",
            "faktabaari.fi",
            "correctiv.org",
            "mimikama.at",
            "factcheck.vlaanderen",
            "factuel.afp.com",
            "nieuwscheckers.nl",
            "faktisk.no",
            "tjekdet.dk",
            "ellinikahoaxes.gr",
            "faktograf.id",
            "stopfake.kz",
            "pesacheck.org",
            "dubawa.org",
            "namibiafactcheck.org.na",
            "zimfact.org",
            "ghanafact.com",
            "factspace.africa",
            "factcrescendo.com",
            "vishvasnews.com",
            "factcheck.lk",
            "newschecker.in",
            "boomlive.in",
            "digiteye.in",
            "indiatoday.in/fact-check",
            "piyasa.com/fact-check",
            "taiwanese.facts.news",
            "taiwanfactcheck.com",
            "mygopen.com",
            "tfc-taiwan.org.tw",
            "cofacts.tw",
            "rumor.taipei",
            "fact.qq.com",
            "factcheck.afp.com/list",
            "acfta.org",
            "crosscheck.firstdraftnews.org",
            "healthfeedback.org",
            "climatefeedback.org",
            "sciencefeedback.co",
            "factcheck.aap.com.au",
            "emergent.info",
            "hoax-slayer.net",
            "factcheck.media",
            "mediawise.org",
            "thejournal.ie/factcheck",
            "journalistsresource.org",
            "metafact.io",
            "reporterslab.org/fact-checking",
        ])
    ],
    "news_sites": [
        FactCheckSource(domain=domain, type=SourceType.NEWS_SITE, priority=2)
        for domain in _unique([
            "www.thedailystar.net",
            "www.thefinancialexpress.com.bd",
            "www.theindependentbd.com",
            "www.dhakatribune.com",
            "www.newagebd.net",
            "www.observerbd.com",
            "www.daily-sun.com",
            "www.tbsnews.net",
            "www.businesspostbd.com",
            "www.banglanews24.com/english",
            "www.bdnews24.com/english",
            "www.risingbd.com/english",
            "www.dailyindustry.news",
            "www.bangladeshpost.net",
            "www.daily-bangladesh.com/english",
        ])
    ]
}


async def fetch_fact_checks(
    api_key: str,
    base_url: str,
    query: str,
    site: FactCheckSource
) -> Dict:
    """
    Fetch fact checks from a specific site using the Google Fact Check API.

    Raises HTTPException(503) on network/API failures and HTTPException(500)
    when the API key or base URL is not configured.
    """
    try:
        if not api_key or not base_url:
            raise ValueError("API key or base URL not configured")

        params = {
            "key": api_key,
            "query": query,
            "languageCode": "en-US",
            "reviewPublisherSiteFilter": site.domain,
            "pageSize": 10
        }

        # FIX: the original call had no timeout, so a stalled upstream could
        # hang the request handler indefinitely.
        response = requests.get(base_url, params=params, timeout=10)
        response.raise_for_status()
        return response.json()
    except requests.RequestException as e:
        raise HTTPException(
            status_code=503,
            detail=ErrorResponse(
                detail=f"Error fetching from {site.domain}: {str(e)}",
                error_code="FACT_CHECK_SERVICE_ERROR",
                path="/check-facts"
            ).dict()
        )
    except ValueError as e:
        raise HTTPException(
            status_code=500,
            detail=ErrorResponse(
                detail=str(e),
                error_code="CONFIGURATION_ERROR",
                path="/check-facts"
            ).dict()
        )


def get_all_sources() -> List[FactCheckSource]:
    """
    Get all sources sorted by priority.

    News sites are deliberately excluded for now; append
    SOURCES["news_sites"] here to re-enable querying them.
    """
    all_sources = SOURCES["fact_checkers"]
    return sorted(all_sources, key=lambda x: x.priority)

অ্যাক্সেসিবিলিটি লিঙ্ক

সরাসরি আসল কন্টেন্টে যানঅ্যাক্সেসিবিলিটির ক্ষেত্রে সহায়তা
অ্যাক্সেসিবিলিটির বিষয়ে মতামত
প্রায় ০টি ফলাফল (০.১৮ সেকেন্ড) 

আপনার সার্চ - q="Indian flag was drawn in BUET campus" site:altnews.in - কোনো পৃষ্ঠাতে পাওয়া যায়নি।

পরামর্শ:

  • দেখুন যে সব বানান ঠিক আছে কিনা
  • অন্য বিষয়বস্তু ব্যবহার করে চেষ্টা করুন
  • আরও সাধারণ বিষয়বস্তু দিয়ে চেষ্টা করুন
  • স্বল্প বিষয়বস্তু দিয়ে চেষ্টা করুন
Google অ্যাপ্লিকেশানগুলি
\ No newline at end of file diff --git a/search_response_bbc_com.html b/search_response_bbc_com.html new file mode 100644 index 0000000..4c6857e --- /dev/null +++ b/search_response_bbc_com.html @@ -0,0 +1,28 @@ +q="Indian flag BUET" site:bbc.com - Google Search

অ্যাক্সেসিবিলিটি লিঙ্ক

সরাসরি আসল কন্টেন্টে যানঅ্যাক্সেসিবিলিটির ক্ষেত্রে সহায়তা
অ্যাক্সেসিবিলিটির বিষয়ে মতামত
প্রায় ০টি ফলাফল (০.১৬ সেকেন্ড) 

আপনার সার্চ - q="Indian flag BUET" site:bbc.com - কোনো পৃষ্ঠাতে পাওয়া যায়নি।

পরামর্শ:

  • দেখুন যে সব বানান ঠিক আছে কিনা
  • অন্য বিষয়বস্তু ব্যবহার করে চেষ্টা করুন
  • আরও সাধারণ বিষয়বস্তু দিয়ে চেষ্টা করুন
  • স্বল্প বিষয়বস্তু দিয়ে চেষ্টা করুন
Google অ্যাপ্লিকেশানগুলি
\ No newline at end of file diff --git a/search_response_en_prothomalo_com.html b/search_response_en_prothomalo_com.html new file mode 100644 index 0000000..298364a --- /dev/null +++ b/search_response_en_prothomalo_com.html @@ -0,0 +1,28 @@ +q="flag BUET campus" site:en.prothomalo.com - Google Search

অ্যাক্সেসিবিলিটি লিঙ্ক

সরাসরি আসল কন্টেন্টে যানঅ্যাক্সেসিবিলিটির ক্ষেত্রে সহায়তা
অ্যাক্সেসিবিলিটির বিষয়ে মতামত
প্রায় ০টি ফলাফল (০.১৩ সেকেন্ড) 

আপনার সার্চ - q="flag BUET campus" site:en.prothomalo.com - কোনো পৃষ্ঠাতে পাওয়া যায়নি।

পরামর্শ:

  • দেখুন যে সব বানান ঠিক আছে কিনা
  • অন্য বিষয়বস্তু ব্যবহার করে চেষ্টা করুন
  • আরও সাধারণ বিষয়বস্তু দিয়ে চেষ্টা করুন
  • স্বল্প বিষয়বস্তু দিয়ে চেষ্টা করুন
Google অ্যাপ্লিকেশানগুলি
\ No newline at end of file