Merge branch 'dev' into 'master'
Dev See merge request planpostai/fact-checker-backend!1
This commit is contained in:
commit
8aae0d18da
18 changed files with 1263 additions and 267 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -1,4 +1,4 @@
|
|||
env
|
||||
.env
|
||||
test.py
|
||||
/__pycache__/
|
||||
__pycache__
|
||||
Binary file not shown.
Binary file not shown.
110
app/api/ai_fact_check.py
Normal file
110
app/api/ai_fact_check.py
Normal file
|
|
@ -0,0 +1,110 @@
|
|||
from fastapi import APIRouter, HTTPException
|
||||
from app.services.openai_client import OpenAIClient, AIFactChecker
|
||||
from app.config import OPENAI_API_KEY
|
||||
from app.models.ai_fact_check_models import (
|
||||
AIFactCheckRequest,
|
||||
AIFactCheckResponse,
|
||||
VerificationResult,
|
||||
TokenUsage,
|
||||
ErrorResponse
|
||||
)
|
||||
from urllib.parse import urlparse
|
||||
import asyncio
|
||||
|
||||
# Initialize router and OpenAI client
|
||||
aifact_check_router = APIRouter()
|
||||
openai_client = OpenAIClient(api_key=OPENAI_API_KEY)
|
||||
fact_checker = AIFactChecker(openai_client=openai_client)
|
||||
|
||||
@aifact_check_router.post(
|
||||
"/aicheck-facts",
|
||||
response_model=AIFactCheckResponse,
|
||||
responses={
|
||||
400: {"model": ErrorResponse},
|
||||
500: {"model": ErrorResponse}
|
||||
}
|
||||
)
|
||||
async def ai_fact_check(request: AIFactCheckRequest):
|
||||
"""
|
||||
Endpoint to fact-check a given statement based on multiple webpage URLs.
|
||||
Input:
|
||||
- urls: List of webpage URLs to analyze (with or without http/https)
|
||||
- content: The fact statement to verify
|
||||
Response:
|
||||
- JSON response with verification results per URL, sources, and token usage
|
||||
"""
|
||||
try:
|
||||
results = {}
|
||||
all_sources = set()
|
||||
all_contexts = []
|
||||
total_prompt_tokens = 0
|
||||
total_completion_tokens = 0
|
||||
total_tokens = 0
|
||||
|
||||
# Process all URLs concurrently
|
||||
tasks = [
|
||||
fact_checker.check_fact(url=url, query=request.content)
|
||||
for url in request.urls
|
||||
]
|
||||
fact_check_results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
# Process results
|
||||
for url, result in zip(request.urls, fact_check_results):
|
||||
if isinstance(result, Exception):
|
||||
# Handle failed URL checks
|
||||
results[url] = VerificationResult(
|
||||
verdict="Error",
|
||||
confidence="Low",
|
||||
evidence=f"Error checking URL: {str(result)}",
|
||||
reasoning="URL processing failed",
|
||||
missing_info="Could not access or process the URL"
|
||||
)
|
||||
continue
|
||||
|
||||
verification_result = VerificationResult(
|
||||
verdict=result["verification_result"]["verdict"],
|
||||
confidence=result["verification_result"]["confidence"],
|
||||
evidence=result["verification_result"]["evidence"],
|
||||
reasoning=result["verification_result"]["reasoning"],
|
||||
missing_info=result["verification_result"].get("missing_info", None)
|
||||
)
|
||||
|
||||
results[url] = verification_result
|
||||
all_sources.update(result["sources"])
|
||||
|
||||
# Accumulate token usage
|
||||
total_prompt_tokens += result["token_usage"]["prompt_tokens"]
|
||||
total_completion_tokens += result["token_usage"]["completion_tokens"]
|
||||
total_tokens += result["token_usage"]["total_tokens"]
|
||||
|
||||
token_usage = TokenUsage(
|
||||
prompt_tokens=total_prompt_tokens,
|
||||
completion_tokens=total_completion_tokens,
|
||||
total_tokens=total_tokens
|
||||
)
|
||||
|
||||
return AIFactCheckResponse(
|
||||
query=request.content,
|
||||
verification_result=results,
|
||||
sources=list(all_sources),
|
||||
token_usage=token_usage
|
||||
)
|
||||
|
||||
except ValueError as e:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=ErrorResponse(
|
||||
detail=str(e),
|
||||
error_code="INVALID_URL",
|
||||
path="/aicheck-facts"
|
||||
).dict()
|
||||
)
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=ErrorResponse(
|
||||
detail=f"Error processing fact-check request: {str(e)}",
|
||||
error_code="PROCESSING_ERROR",
|
||||
path="/aicheck-facts"
|
||||
).dict()
|
||||
)
|
||||
|
|
@ -1,291 +1,192 @@
|
|||
from fastapi import APIRouter, HTTPException
|
||||
from pydantic import BaseModel, Field, HttpUrl, validator, ConfigDict
|
||||
from typing import Dict, List, Optional, Union
|
||||
import requests
|
||||
from enum import Enum
|
||||
from datetime import datetime
|
||||
import json
|
||||
from app.config import GOOGLE_FACT_CHECK_API_KEY, GOOGLE_FACT_CHECK_BASE_URL
|
||||
import httpx
|
||||
from app.config import GOOGLE_API_KEY, GOOGLE_FACT_CHECK_BASE_URL, OPENAI_API_KEY
|
||||
from app.api.scrap_websites import search_websites, SearchRequest
|
||||
from app.services.openai_client import OpenAIClient
|
||||
from app.models.fact_check_models import (
|
||||
FactCheckRequest,
|
||||
FactCheckResponse,
|
||||
ErrorResponse,
|
||||
Source
|
||||
)
|
||||
from app.websites.fact_checker_website import get_all_sources
|
||||
|
||||
fact_check_router = APIRouter()
|
||||
openai_client = OpenAIClient(OPENAI_API_KEY)
|
||||
|
||||
class CustomJSONEncoder(json.JSONEncoder):
|
||||
def default(self, obj):
|
||||
if isinstance(obj, datetime):
|
||||
return obj.isoformat()
|
||||
return super().default(obj)
|
||||
async def generate_fact_report(query: str, fact_check_data: dict) -> FactCheckResponse:
|
||||
"""Generate a fact check report using OpenAI based on the fact check results."""
|
||||
try:
|
||||
base_system_prompt = """You are a professional fact-checking reporter. Your task is to create a detailed fact check report based on the provided data. Focus on accuracy, clarity, and proper citation of sources.
|
||||
|
||||
class ErrorResponse(BaseModel):
|
||||
detail: str
|
||||
error_code: str = Field(..., description="Unique error code for this type of error")
|
||||
timestamp: str = Field(default_factory=lambda: datetime.now().isoformat())
|
||||
path: Optional[str] = Field(None, description="The endpoint path where error occurred")
|
||||
Rules:
|
||||
1. Include all source URLs and names in the sources list
|
||||
2. Keep the explanation focused on verifiable facts
|
||||
3. Include dates when available
|
||||
4. Maintain objectivity in the report"""
|
||||
|
||||
model_config = ConfigDict(json_schema_extra={
|
||||
"example": {
|
||||
"detail": "Error description",
|
||||
"error_code": "ERROR_CODE",
|
||||
"timestamp": "2024-12-09T16:49:30.905765",
|
||||
"path": "/check-facts"
|
||||
base_user_prompt = """Generate a comprehensive fact check report in this exact JSON format:
|
||||
{
|
||||
"claim": "Write the exact claim being verified",
|
||||
"verdict": "One of: True/False/Partially True/Unverified",
|
||||
"confidence": "One of: High/Medium/Low",
|
||||
"sources": [
|
||||
{
|
||||
"url": "Full URL of the source",
|
||||
"name": "Name of the source organization"
|
||||
}
|
||||
})
|
||||
|
||||
class RequestValidationError(BaseModel):
|
||||
loc: List[str]
|
||||
msg: str
|
||||
type: str
|
||||
|
||||
class Publisher(BaseModel):
|
||||
name: str
|
||||
site: Optional[str] = Field(None, description="Publisher's website")
|
||||
|
||||
@validator('site')
|
||||
def validate_site(cls, v):
|
||||
if v and not (v.startswith('http://') or v.startswith('https://')):
|
||||
return f"https://{v}"
|
||||
return v
|
||||
|
||||
class ClaimReview(BaseModel):
|
||||
publisher: Publisher
|
||||
url: Optional[HttpUrl] = None
|
||||
title: Optional[str] = None
|
||||
reviewDate: Optional[str] = None
|
||||
textualRating: Optional[str] = None
|
||||
languageCode: str = Field(default="en-US")
|
||||
|
||||
class Claim(BaseModel):
|
||||
text: str
|
||||
claimant: Optional[str] = None
|
||||
claimDate: Optional[str] = None
|
||||
claimReview: List[ClaimReview]
|
||||
|
||||
class FactCheckResponse(BaseModel):
|
||||
query: str = Field(..., description="Original query that was fact-checked")
|
||||
total_claims_found: int = Field(..., ge=0)
|
||||
results: List[Claim] = Field(default_factory=list)
|
||||
summary: Dict[str, int] = Field(...)
|
||||
|
||||
model_config = ConfigDict(json_schema_extra={
|
||||
"example": {
|
||||
"query": "Example claim",
|
||||
"total_claims_found": 1,
|
||||
"results": [{
|
||||
"text": "Example claim text",
|
||||
"claimant": "Source name",
|
||||
"claimReview": [{
|
||||
"publisher": {
|
||||
"name": "Fact Checker",
|
||||
"site": "factchecker.com"
|
||||
},
|
||||
"textualRating": "True"
|
||||
}]
|
||||
}],
|
||||
"summary": {
|
||||
"total_sources": 1,
|
||||
"fact_checking_sites_queried": 10
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
class SourceType(str, Enum):
|
||||
FACT_CHECKER = "fact_checker"
|
||||
NEWS_SITE = "news_site"
|
||||
|
||||
class FactCheckSource(BaseModel):
|
||||
domain: str
|
||||
type: SourceType
|
||||
priority: int = Field(default=1, ge=1, le=10)
|
||||
|
||||
model_config = ConfigDict(json_schema_extra={
|
||||
"example": {
|
||||
"domain": "factcheck.org",
|
||||
"type": "fact_checker",
|
||||
"priority": 1
|
||||
}
|
||||
})
|
||||
|
||||
# Sources configuration with validation
|
||||
SOURCES = {
|
||||
"fact_checkers": [
|
||||
FactCheckSource(domain=domain, type=SourceType.FACT_CHECKER, priority=1)
|
||||
for domain in [
|
||||
"factcheck.org",
|
||||
"snopes.com",
|
||||
"politifact.com",
|
||||
"reuters.com",
|
||||
"bbc.com",
|
||||
"apnews.com",
|
||||
"usatoday.com",
|
||||
"nytimes.com",
|
||||
"washingtonpost.com",
|
||||
"afp.com",
|
||||
"fullfact.org",
|
||||
"truthorfiction.com",
|
||||
"leadstories.com",
|
||||
"altnews.in",
|
||||
"boomlive.in",
|
||||
"en.prothomalo.com"
|
||||
]
|
||||
],
|
||||
"news_sites": [
|
||||
FactCheckSource(domain=domain, type=SourceType.NEWS_SITE, priority=2)
|
||||
for domain in [
|
||||
"www.thedailystar.net",
|
||||
"www.thefinancialexpress.com.bd",
|
||||
"www.theindependentbd.com",
|
||||
"www.dhakatribune.com",
|
||||
"www.newagebd.net",
|
||||
"www.observerbd.com",
|
||||
"www.daily-sun.com",
|
||||
"www.tbsnews.net",
|
||||
"www.businesspostbd.com",
|
||||
"www.banglanews24.com/english",
|
||||
"www.bdnews24.com/english",
|
||||
"www.risingbd.com/english",
|
||||
"www.dailyindustry.news",
|
||||
"www.bangladeshpost.net",
|
||||
"www.daily-bangladesh.com/english"
|
||||
]
|
||||
]
|
||||
"evidence": "A concise summary of the key evidence (1-2 sentences)",
|
||||
"explanation": "A detailed explanation including who verified it, when it was verified, and the key findings (2-3 sentences)",
|
||||
"additional_context": "Important context about the verification process, limitations, or broader implications (1-2 sentences)"
|
||||
}
|
||||
|
||||
class FactCheckRequest(BaseModel):
|
||||
content: str = Field(
|
||||
...,
|
||||
min_length=10,
|
||||
max_length=1000,
|
||||
description="The claim to be fact-checked"
|
||||
)
|
||||
language: str = Field(default="en-US", pattern="^[a-z]{2}-[A-Z]{2}$")
|
||||
max_results_per_source: int = Field(default=10, ge=1, le=50)
|
||||
Ensure all URLs in sources are complete (including https:// if missing) and each source has both a URL and name."""
|
||||
|
||||
@validator('content')
|
||||
def validate_content(cls, v):
|
||||
if not v.strip():
|
||||
raise ValueError("Content cannot be empty or just whitespace")
|
||||
return v.strip()
|
||||
if "claims" in fact_check_data:
|
||||
system_prompt = base_system_prompt
|
||||
user_prompt = f"""Query: {query}
|
||||
Fact Check Results: {fact_check_data}
|
||||
|
||||
{base_user_prompt}
|
||||
|
||||
async def fetch_fact_checks(
|
||||
api_key: str,
|
||||
base_url: str,
|
||||
query: str,
|
||||
site: FactCheckSource
|
||||
) -> Dict:
|
||||
"""
|
||||
Fetch fact checks from a specific site using the Google Fact Check API
|
||||
"""
|
||||
try:
|
||||
if not api_key or not base_url:
|
||||
raise ValueError("API key or base URL not configured")
|
||||
The report should:
|
||||
1. Include ALL source URLs and organization names
|
||||
2. Specify verification dates when available
|
||||
3. Name the fact-checking organizations involved
|
||||
4. Describe the verification process"""
|
||||
|
||||
else:
|
||||
system_prompt = base_system_prompt
|
||||
user_prompt = f"""Query: {query}
|
||||
Fact Check Results: {fact_check_data}
|
||||
|
||||
{base_user_prompt}
|
||||
|
||||
params = {
|
||||
"key": api_key,
|
||||
"query": query,
|
||||
"languageCode": "en-US",
|
||||
"reviewPublisherSiteFilter": site.domain,
|
||||
"pageSize": 10
|
||||
}
|
||||
The report should:
|
||||
1. Include ALL source URLs and names from both verification_result and sources fields
|
||||
2. Mention all fact-checking organizations involved
|
||||
3. Describe the verification process
|
||||
4. Note any conflicting information between sources"""
|
||||
|
||||
response = await openai_client.generate_text_response(
|
||||
system_prompt=system_prompt,
|
||||
user_prompt=user_prompt,
|
||||
max_tokens=1000
|
||||
)
|
||||
|
||||
response = requests.get(base_url, params=params)
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
except requests.RequestException as e:
|
||||
raise HTTPException(
|
||||
status_code=503,
|
||||
detail=ErrorResponse(
|
||||
detail=f"Error fetching from {site.domain}: {str(e)}",
|
||||
error_code="FACT_CHECK_SERVICE_ERROR",
|
||||
path="/check-facts"
|
||||
).dict()
|
||||
)
|
||||
except ValueError as e:
|
||||
try:
|
||||
# First try to parse the response directly
|
||||
response_data = response["response"]
|
||||
|
||||
# Clean up sources before validation
|
||||
if isinstance(response_data.get('sources'), list):
|
||||
cleaned_sources = []
|
||||
for source in response_data['sources']:
|
||||
if isinstance(source, str):
|
||||
# Convert string sources to Source objects
|
||||
url = source if source.startswith('http') else f"https://{source}"
|
||||
cleaned_sources.append({
|
||||
"url": url,
|
||||
"name": source
|
||||
})
|
||||
elif isinstance(source, dict):
|
||||
# Ensure URL has proper scheme
|
||||
url = source.get('url', '')
|
||||
if url and not url.startswith('http'):
|
||||
source['url'] = f"https://{url}"
|
||||
cleaned_sources.append(source)
|
||||
response_data['sources'] = cleaned_sources
|
||||
|
||||
fact_check_response = FactCheckResponse(**response_data)
|
||||
return fact_check_response
|
||||
|
||||
except Exception as validation_error:
|
||||
print(f"Response validation error: {str(validation_error)}")
|
||||
raise HTTPException(
|
||||
status_code=422,
|
||||
detail=ErrorResponse(
|
||||
detail=f"Invalid response format: {str(validation_error)}",
|
||||
error_code="VALIDATION_ERROR",
|
||||
path="/check-facts"
|
||||
).dict()
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error generating fact report: {str(e)}")
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=ErrorResponse(
|
||||
detail=str(e),
|
||||
detail="Error generating fact report",
|
||||
error_code="FACT_CHECK_ERROR",
|
||||
path="/check-facts"
|
||||
).dict()
|
||||
)
|
||||
|
||||
@fact_check_router.post("/check-facts", response_model=FactCheckResponse)
|
||||
async def check_facts(request: FactCheckRequest):
|
||||
"""
|
||||
Fetch fact check results and generate a comprehensive report.
|
||||
"""
|
||||
if not GOOGLE_API_KEY or not GOOGLE_FACT_CHECK_BASE_URL:
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=ErrorResponse(
|
||||
detail="Google API key or base URL is not configured",
|
||||
error_code="CONFIGURATION_ERROR",
|
||||
path="/check-facts"
|
||||
).dict()
|
||||
)
|
||||
|
||||
@fact_check_router.post(
|
||||
"/check-facts",
|
||||
response_model=FactCheckResponse,
|
||||
responses={
|
||||
400: {"model": ErrorResponse},
|
||||
404: {"model": ErrorResponse},
|
||||
500: {"model": ErrorResponse},
|
||||
503: {"model": ErrorResponse}
|
||||
}
|
||||
)
|
||||
async def check_facts(request: FactCheckRequest) -> FactCheckResponse:
|
||||
"""
|
||||
Check facts using multiple fact-checking sources
|
||||
"""
|
||||
all_results = []
|
||||
|
||||
# Validate configuration
|
||||
if not GOOGLE_FACT_CHECK_API_KEY or not GOOGLE_FACT_CHECK_BASE_URL:
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=ErrorResponse(
|
||||
detail="API configuration is missing",
|
||||
error_code="CONFIGURATION_ERROR",
|
||||
path="/check-facts"
|
||||
).dict()
|
||||
)
|
||||
|
||||
# Check all sources in priority order
|
||||
all_sources = (
|
||||
SOURCES["fact_checkers"] +
|
||||
SOURCES["news_sites"]
|
||||
)
|
||||
all_sources.sort(key=lambda x: x.priority)
|
||||
|
||||
for source in all_sources:
|
||||
headers = {"Content-Type": "application/json"}
|
||||
async with httpx.AsyncClient() as client:
|
||||
# Get fact checker sources from the centralized configuration
|
||||
fact_checker_sources = get_all_sources()
|
||||
|
||||
for source in fact_checker_sources:
|
||||
params = {
|
||||
"key": GOOGLE_API_KEY,
|
||||
"query": request.query,
|
||||
"languageCode": "en-US",
|
||||
"reviewPublisherSiteFilter": source.domain,
|
||||
"pageSize": 10
|
||||
}
|
||||
|
||||
try:
|
||||
response = await client.get(
|
||||
GOOGLE_FACT_CHECK_BASE_URL,
|
||||
params=params,
|
||||
headers=headers
|
||||
)
|
||||
response.raise_for_status()
|
||||
json_response = response.json()
|
||||
|
||||
if json_response.get("claims"):
|
||||
return await generate_fact_report(request.query, json_response)
|
||||
|
||||
except httpx.RequestError as e:
|
||||
print(f"Error fetching results for site {source.domain}: {str(e)}")
|
||||
continue
|
||||
except Exception as e:
|
||||
print(f"Unexpected error for site {source.domain}: {str(e)}")
|
||||
continue
|
||||
|
||||
try:
|
||||
result = await fetch_fact_checks(
|
||||
GOOGLE_FACT_CHECK_API_KEY,
|
||||
GOOGLE_FACT_CHECK_BASE_URL,
|
||||
request.content,
|
||||
source
|
||||
search_request = SearchRequest(
|
||||
search_text=request.query,
|
||||
source_types=["fact_checkers"]
|
||||
)
|
||||
|
||||
if "claims" in result:
|
||||
# Validate each claim through Pydantic
|
||||
validated_claims = [
|
||||
Claim(**claim).dict()
|
||||
for claim in result["claims"]
|
||||
]
|
||||
all_results.extend(validated_claims)
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
ai_response = await search_websites(search_request)
|
||||
return await generate_fact_report(request.query, ai_response)
|
||||
|
||||
except Exception as e:
|
||||
# Log the error but continue with other sources
|
||||
print(f"Error processing {source.domain}: {str(e)}")
|
||||
continue
|
||||
|
||||
if not all_results:
|
||||
raise HTTPException(
|
||||
status_code=404,
|
||||
detail=ErrorResponse(
|
||||
detail="No fact check results found",
|
||||
error_code="NO_RESULTS_FOUND",
|
||||
path="/check-facts"
|
||||
).dict()
|
||||
)
|
||||
|
||||
# Create the response using Pydantic model
|
||||
response = FactCheckResponse(
|
||||
query=request.content,
|
||||
total_claims_found=len(all_results),
|
||||
results=all_results,
|
||||
summary={
|
||||
"total_sources": len(set(claim.get("claimReview", [{}])[0].get("publisher", {}).get("site", "")
|
||||
for claim in all_results if claim.get("claimReview"))),
|
||||
"fact_checking_sites_queried": len(all_sources)
|
||||
}
|
||||
)
|
||||
|
||||
return response
|
||||
print(f"Error in AI fact check: {str(e)}")
|
||||
raise HTTPException(
|
||||
status_code=404,
|
||||
detail=ErrorResponse(
|
||||
detail="No fact check results found",
|
||||
error_code="NOT_FOUND",
|
||||
path="/check-facts"
|
||||
).dict()
|
||||
)
|
||||
160
app/api/scrap_websites.py
Normal file
160
app/api/scrap_websites.py
Normal file
|
|
@ -0,0 +1,160 @@
|
|||
from fastapi import APIRouter, HTTPException
|
||||
import httpx
|
||||
import logging
|
||||
from urllib.parse import urlparse
|
||||
from typing import List, Dict, Optional
|
||||
from pydantic import BaseModel
|
||||
from app.models.ai_fact_check_models import (
|
||||
AIFactCheckRequest,
|
||||
FactCheckSource,
|
||||
SourceType
|
||||
)
|
||||
from app.websites.fact_checker_website import SOURCES, get_all_sources
|
||||
from app.api.ai_fact_check import ai_fact_check
|
||||
from app.config import GOOGLE_API_KEY, GOOGLE_ENGINE_ID, GOOGLE_SEARCH_URL
|
||||
|
||||
|
||||
class SearchRequest(BaseModel):
|
||||
search_text: str
|
||||
source_types: List[str] = ["fact_checkers"]
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
scrap_websites_router = APIRouter()
|
||||
|
||||
# Constants
|
||||
RESULTS_PER_PAGE = 10
|
||||
MAX_PAGES = 5
|
||||
MAX_URLS_PER_DOMAIN = 5
|
||||
|
||||
|
||||
def get_domain_from_url(url: str) -> str:
|
||||
"""Extract domain from URL with improved handling."""
|
||||
try:
|
||||
parsed = urlparse(url)
|
||||
domain = parsed.netloc.lower()
|
||||
if domain.startswith('www.'):
|
||||
domain = domain[4:]
|
||||
return domain
|
||||
except Exception as e:
|
||||
logger.error(f"Error extracting domain from URL {url}: {str(e)}")
|
||||
return ""
|
||||
|
||||
def is_valid_source_domain(domain: str, sources: List[FactCheckSource]) -> bool:
|
||||
"""Check if domain matches any source with improved matching logic."""
|
||||
if not domain:
|
||||
return False
|
||||
|
||||
domain = domain.lower()
|
||||
if domain.startswith('www.'):
|
||||
domain = domain[4:]
|
||||
|
||||
for source in sources:
|
||||
source_domain = source.domain.lower()
|
||||
if source_domain.startswith('www.'):
|
||||
source_domain = source_domain[4:]
|
||||
|
||||
if domain == source_domain or domain.endswith('.' + source_domain):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
async def build_enhanced_search_query(query: str, sources: List[FactCheckSource]) -> str:
|
||||
"""Build search query with site restrictions."""
|
||||
site_queries = [f"site:{source.domain}" for source in sources]
|
||||
site_restriction = " OR ".join(site_queries)
|
||||
return f"({query}) ({site_restriction})"
|
||||
|
||||
async def google_custom_search(query: str, sources: List[FactCheckSource], page: int = 1) -> Optional[Dict]:
|
||||
"""Perform Google Custom Search with enhanced query."""
|
||||
enhanced_query = await build_enhanced_search_query(query, sources)
|
||||
start_index = ((page - 1) * RESULTS_PER_PAGE) + 1
|
||||
|
||||
params = {
|
||||
"key": GOOGLE_API_KEY,
|
||||
"cx": GOOGLE_ENGINE_ID,
|
||||
"q": enhanced_query,
|
||||
"num": RESULTS_PER_PAGE,
|
||||
"start": start_index
|
||||
}
|
||||
|
||||
async with httpx.AsyncClient(timeout=30.0) as client:
|
||||
try:
|
||||
response = await client.get(GOOGLE_SEARCH_URL, params=params)
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
except Exception as e:
|
||||
logger.error(f"Search error: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"Search error: {str(e)}")
|
||||
|
||||
@scrap_websites_router.post("/search")
|
||||
async def search_websites(request: SearchRequest):
|
||||
# Get the source types from the request
|
||||
source_types = request.source_types if request.source_types else ["fact_checkers"]
|
||||
|
||||
# Get sources based on requested types
|
||||
selected_sources = []
|
||||
for source_type in source_types:
|
||||
if source_type in SOURCES:
|
||||
selected_sources.extend(SOURCES[source_type])
|
||||
|
||||
# If no valid sources found, use fact checkers as default
|
||||
if not selected_sources:
|
||||
selected_sources = SOURCES["fact_checkers"]
|
||||
|
||||
all_urls = []
|
||||
domain_results = {}
|
||||
|
||||
try:
|
||||
for page in range(1, MAX_PAGES + 1):
|
||||
if len(all_urls) >= 50:
|
||||
break
|
||||
|
||||
search_response = await google_custom_search(request.search_text, selected_sources, page)
|
||||
|
||||
if not search_response or not search_response.get("items"):
|
||||
break
|
||||
|
||||
for item in search_response.get("items", []):
|
||||
url = item.get("link")
|
||||
if not url:
|
||||
continue
|
||||
|
||||
domain = get_domain_from_url(url)
|
||||
|
||||
if is_valid_source_domain(domain, selected_sources):
|
||||
if domain not in domain_results:
|
||||
domain_results[domain] = []
|
||||
|
||||
if len(domain_results[domain]) < MAX_URLS_PER_DOMAIN:
|
||||
domain_results[domain].append({
|
||||
"url": url,
|
||||
"title": item.get("title", ""),
|
||||
"snippet": item.get("snippet", "")
|
||||
})
|
||||
all_urls.append(url)
|
||||
|
||||
if len(all_urls) >= 50:
|
||||
break
|
||||
|
||||
if not all_urls:
|
||||
return {
|
||||
"status": "no_results",
|
||||
"urls_found": 0
|
||||
}
|
||||
|
||||
fact_check_request = AIFactCheckRequest(
|
||||
content=request.search_text,
|
||||
urls=all_urls[:5]
|
||||
)
|
||||
|
||||
return await ai_fact_check(fact_check_request)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error during search/fact-check process: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
|
@ -3,8 +3,10 @@ from dotenv import load_dotenv
|
|||
|
||||
load_dotenv()
|
||||
|
||||
GOOGLE_FACT_CHECK_API_KEY = os.environ["GOOGLE_FACT_CHECK_API_KEY"]
|
||||
GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]
|
||||
GOOGLE_FACT_CHECK_BASE_URL= os.environ["GOOGLE_FACT_CHECK_BASE_URL"]
|
||||
GOOGLE_ENGINE_ID = os.environ["GOOGLE_ENGINE_ID"]
|
||||
GOOGLE_SEARCH_URL = os.environ["GOOGLE_SEARCH_URL"]
|
||||
|
||||
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
|
||||
FRONTEND_URL = os.environ["FRONTEND_URL"]
|
||||
BIN
app/models/__pycache__/fact_check_models.cpython-312.pyc
Normal file
BIN
app/models/__pycache__/fact_check_models.cpython-312.pyc
Normal file
Binary file not shown.
229
app/models/ai_fact_check_models.py
Normal file
229
app/models/ai_fact_check_models.py
Normal file
|
|
@ -0,0 +1,229 @@
|
|||
from pydantic import BaseModel, Field, HttpUrl, validator, ConfigDict
|
||||
from typing import Dict, List, Optional, Any, Union
|
||||
from enum import Enum
|
||||
from datetime import datetime
|
||||
from urllib.parse import urlparse
|
||||
|
||||
# Common Models
|
||||
class TokenUsage(BaseModel):
|
||||
prompt_tokens: Optional[int] = 0
|
||||
completion_tokens: Optional[int] = 0
|
||||
total_tokens: Optional[int] = 0
|
||||
|
||||
class ErrorResponse(BaseModel):
|
||||
detail: str
|
||||
error_code: str = Field(..., description="Unique error code for this type of error")
|
||||
timestamp: str = Field(default_factory=lambda: datetime.now().isoformat())
|
||||
path: Optional[str] = Field(None, description="The endpoint path where error occurred")
|
||||
|
||||
model_config = ConfigDict(json_schema_extra={
|
||||
"example": {
|
||||
"detail": "Error description",
|
||||
"error_code": "ERROR_CODE",
|
||||
"timestamp": "2024-12-09T16:49:30.905765",
|
||||
"path": "/check-facts"
|
||||
}
|
||||
})
|
||||
|
||||
# Fact Check Models
|
||||
class Publisher(BaseModel):
|
||||
name: str
|
||||
site: Optional[str] = Field(None, description="Publisher's website")
|
||||
|
||||
@validator('site')
|
||||
def validate_site(cls, v):
|
||||
if v and not (v.startswith('http://') or v.startswith('https://')):
|
||||
return f"https://{v}"
|
||||
return v
|
||||
|
||||
class ClaimReview(BaseModel):
|
||||
publisher: Publisher
|
||||
url: Optional[HttpUrl] = None
|
||||
title: Optional[str] = None
|
||||
reviewDate: Optional[str] = None
|
||||
textualRating: Optional[str] = None
|
||||
languageCode: str = Field(default="en-US")
|
||||
|
||||
class Claim(BaseModel):
|
||||
text: str
|
||||
claimant: Optional[str] = None
|
||||
claimDate: Optional[str] = None
|
||||
claimReview: List[ClaimReview]
|
||||
|
||||
class SourceType(str, Enum):
|
||||
FACT_CHECKER = "fact_checker"
|
||||
NEWS_SITE = "news_site"
|
||||
|
||||
class FactCheckSource(BaseModel):
|
||||
domain: str
|
||||
type: SourceType
|
||||
priority: int = Field(default=1, ge=1, le=10)
|
||||
|
||||
# Verification Models
|
||||
class VerificationResult(BaseModel):
|
||||
verdict: str = Field(..., description="True/False/Insufficient Information")
|
||||
confidence: str = Field(..., description="High/Medium/Low")
|
||||
evidence: Union[str, List[str]]
|
||||
reasoning: str
|
||||
missing_info: Optional[str] = None
|
||||
|
||||
model_config = ConfigDict(json_schema_extra={
|
||||
"example": {
|
||||
"verdict": "True",
|
||||
"confidence": "High",
|
||||
"evidence": ["Direct quote from source supporting the claim"],
|
||||
"reasoning": "Detailed analysis of why the claim is considered true",
|
||||
"missing_info": "Any caveats or limitations of the verification"
|
||||
}
|
||||
})
|
||||
|
||||
# Request Models
|
||||
class BaseFactCheckRequest(BaseModel):
|
||||
content: str = Field(
|
||||
...,
|
||||
min_length=10,
|
||||
max_length=1000,
|
||||
description="The claim to be fact-checked"
|
||||
)
|
||||
|
||||
@validator('content')
|
||||
def validate_content(cls, v):
|
||||
if not v.strip():
|
||||
raise ValueError("Content cannot be empty or just whitespace")
|
||||
return v.strip()
|
||||
|
||||
class GoogleFactCheckRequest(BaseFactCheckRequest):
|
||||
language: str = Field(default="en-US", pattern="^[a-z]{2}-[A-Z]{2}$")
|
||||
max_results_per_source: int = Field(default=10, ge=1, le=50)
|
||||
|
||||
class AIFactCheckRequest(BaseFactCheckRequest):
|
||||
urls: List[str] = Field(
|
||||
...,
|
||||
min_items=1,
|
||||
max_items=5,
|
||||
description="List of URLs to check the content against. URLs will be prefixed with https:// if protocol is missing"
|
||||
)
|
||||
|
||||
@validator('urls')
|
||||
def validate_urls(cls, urls):
|
||||
validated_urls = []
|
||||
for url in urls:
|
||||
if not url.strip():
|
||||
raise ValueError("URL cannot be empty")
|
||||
|
||||
# Add https:// if no protocol specified
|
||||
if not url.startswith(('http://', 'https://')):
|
||||
url = f'https://{url}'
|
||||
|
||||
try:
|
||||
result = urlparse(url)
|
||||
if not result.netloc:
|
||||
raise ValueError(f"Invalid URL structure for {url}")
|
||||
validated_urls.append(url)
|
||||
except Exception as e:
|
||||
raise ValueError(f"Invalid URL {url}: {str(e)}")
|
||||
|
||||
return validated_urls
|
||||
|
||||
model_config = ConfigDict(json_schema_extra={
|
||||
"example": {
|
||||
"content": "Indian flag was drawn in BUET campus",
|
||||
"urls": [
|
||||
"www.altnews.in/article-about-flag",
|
||||
"www.another-source.com/related-news"
|
||||
]
|
||||
}
|
||||
})
|
||||
|
||||
# Response Models
|
||||
class BaseFactCheckResponse(BaseModel):
|
||||
query: str
|
||||
token_usage: TokenUsage
|
||||
sources: List[str]
|
||||
|
||||
model_config = ConfigDict(json_schema_extra={
|
||||
"example": {
|
||||
"query": "Example statement to verify",
|
||||
"token_usage": {
|
||||
"prompt_tokens": 100,
|
||||
"completion_tokens": 50,
|
||||
"total_tokens": 150
|
||||
},
|
||||
"sources": ["source1.com", "source2.com"],
|
||||
}
|
||||
})
|
||||
|
||||
class GoogleFactCheckResponse(BaseFactCheckResponse):
|
||||
total_claims_found: int
|
||||
results: List[Dict[str, Any]]
|
||||
verification_result: Dict[str, Any]
|
||||
summary: Dict[str, int]
|
||||
|
||||
model_config = ConfigDict(json_schema_extra={
|
||||
"example": {
|
||||
"query": "Example claim",
|
||||
"total_claims_found": 1,
|
||||
"results": [{
|
||||
"text": "Example claim text",
|
||||
"claimant": "Source name",
|
||||
"claimReview": [{
|
||||
"publisher": {
|
||||
"name": "Fact Checker",
|
||||
"site": "factchecker.com"
|
||||
},
|
||||
"textualRating": "True"
|
||||
}]
|
||||
}],
|
||||
"verification_result": {
|
||||
"verdict": "True",
|
||||
"confidence": "High",
|
||||
"evidence": ["Supporting evidence"],
|
||||
"reasoning": "Detailed analysis"
|
||||
},
|
||||
"sources": ["factchecker.com"],
|
||||
"token_usage": {
|
||||
"prompt_tokens": 100,
|
||||
"completion_tokens": 50,
|
||||
"total_tokens": 150
|
||||
},
|
||||
"summary": {
|
||||
"total_sources": 1,
|
||||
"fact_checking_sites_queried": 10
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
class AIFactCheckResponse(BaseFactCheckResponse):
    """Response shape for AI-driven fact checks against scraped webpages."""

    # Keyed by source URL so each analyzed page reports its own verdict.
    verification_result: Dict[str, VerificationResult]

    model_config = ConfigDict(json_schema_extra={
        "example": {
            "query": "Indian flag was drawn in BUET campus",
            "verification_result": {
                "https://www.source1.com": {
                    "verdict": "True",
                    "confidence": "High",
                    "evidence": ["Supporting evidence from source 1"],
                    "reasoning": "Detailed analysis from source 1",
                    "missing_info": None
                },
                "https://www.source2.com": {
                    "verdict": "True",
                    "confidence": "Medium",
                    "evidence": ["Supporting evidence from source 2"],
                    "reasoning": "Analysis from source 2",
                    "missing_info": "Additional context needed"
                }
            },
            "sources": ["source1.com", "source2.com"],
            "token_usage": {
                "prompt_tokens": 200,
                "completion_tokens": 100,
                "total_tokens": 300
            }
        }
    })
|
||||
|
||||
# Backwards compatibility aliases: keep the unprefixed names pointing at the
# Google-backed models for callers that have not migrated to the new names.
FactCheckRequest = GoogleFactCheckRequest
FactCheckResponse = GoogleFactCheckResponse
|
||||
101
app/models/fact_check_models.py
Normal file
101
app/models/fact_check_models.py
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
from datetime import datetime
from enum import Enum
from typing import List, Literal, Union

from pydantic import BaseModel, ConfigDict, Field, HttpUrl, validator
||||
|
||||
class VerdictEnum(str, Enum):
    """Closed set of verdicts a fact check can produce."""
    TRUE = "True"
    FALSE = "False"
    PARTIALLY_TRUE = "Partially True"
    UNVERIFIED = "Unverified"
||||
|
||||
class ConfidenceEnum(str, Enum):
    """Closed set of confidence levels attached to a verdict."""
    HIGH = "High"
    MEDIUM = "Medium"
    LOW = "Low"
|
||||
|
||||
class FactCheckRequest(BaseModel):
    """Inbound request: a single claim to be fact-checked."""

    query: str = Field(
        ...,
        min_length=3,
        max_length=500,
        description="The claim or statement to be fact-checked",
        example="Did NASA confirm finding alien structures on Mars in 2024?"
    )
|
||||
|
||||
class Source(BaseModel):
    """A single evidence source cited in a fact-check response."""

    url: str
    name: str = ""

    # NOTE(review): pydantic v1-style @validator is deprecated in pydantic v2
    # (field_validator); kept as-is here to avoid a behavior change.
    @validator('url')
    def validate_url(cls, v):
        # Basic URL validation without requiring HTTP/HTTPS
        if not v or len(v) < 3:
            raise ValueError("URL must not be empty and must be at least 3 characters")
        return v
|
||||
|
||||
class FactCheckResponse(BaseModel):
    """Structured verdict produced by the fact-checking pipeline.

    Uses the pydantic-v2 configuration style (``model_config`` with
    ``ConfigDict``, and ``min_length`` on the list field instead of the
    deprecated ``min_items``) so this module matches the convention already
    used by the sibling model modules in this package.
    """

    # The claim text exactly as it was verified.
    claim: str = Field(
        ...,
        min_length=10,
        max_length=1000,
        description="The exact claim being verified"
    )
    verdict: VerdictEnum = Field(
        ...,
        description="The verification verdict"
    )
    confidence: ConfidenceEnum = Field(
        ...,
        description="Confidence level in the verdict"
    )
    # At least one source must back any verdict.
    sources: List[Source] = Field(
        ...,
        min_length=1,  # pydantic-v2 spelling of the deprecated `min_items`
        description="List of sources used in verification"
    )
    evidence: str = Field(
        ...,
        min_length=20,
        max_length=500,
        description="Concise summary of key evidence"
    )
    explanation: str = Field(
        ...,
        min_length=50,
        max_length=1000,
        description="Detailed explanation of verification findings"
    )
    additional_context: str = Field(
        ...,
        min_length=20,
        max_length=500,
        description="Important context about the verification"
    )

    model_config = ConfigDict(json_schema_extra={
        "example": {
            "claim": "NASA confirmed finding alien structures on Mars in 2024",
            "verdict": "False",
            "confidence": "High",
            "sources": [
                {
                    "url": "https://www.nasa.gov/mars-exploration",
                    "name": "NASA Mars Exploration"
                },
                {
                    "url": "https://factcheck.org/2024/mars-claims",
                    "name": "FactCheck.org"
                }
            ],
            "evidence": "NASA has made no such announcement. Recent Mars rover images show natural rock formations.",
            "explanation": "Multiple fact-checking organizations investigated this claim. NASA's official communications and Mars mission reports from 2024 contain no mention of alien structures. The viral images being shared are misidentified natural geological formations.",
            "additional_context": "Similar false claims about alien structures on Mars have circulated periodically since the first Mars rovers began sending back images."
        }
    })
|
||||
|
||||
class ErrorResponse(BaseModel):
    """Uniform error payload returned by the fact-check endpoints."""

    # Human-readable description of what went wrong.
    detail: str
    # Machine-readable code, e.g. VALIDATION_ERROR / FACT_CHECK_SERVICE_ERROR.
    error_code: str = Field(..., example="VALIDATION_ERROR")
    # Request path on which the error occurred.
    path: str = Field(..., example="/check-facts")
||||
43
app/models/scrap_websites_models.py
Normal file
43
app/models/scrap_websites_models.py
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
from pydantic import BaseModel
|
||||
from typing import List, Dict
|
||||
|
||||
class SearchRequest(BaseModel):
    """Request for the website-scraping search endpoint."""

    # Free-text query to search for.
    search_text: str
    # Source groups to query; defaults to the dedicated fact-checking sites.
    source_types: List[str] = ["fact_checkers"]
|
||||
|
||||
class Publisher(BaseModel):
    """Publisher of a claim review (name and site domain)."""
    name: str
    site: str
|
||||
|
||||
class ClaimReview(BaseModel):
    """A single published review of a claim, with its textual rating."""
    publisher: Publisher
    textualRating: str
|
||||
|
||||
class Claim(BaseModel):
    """A claim together with who made it and how it was reviewed."""
    claimReview: List[ClaimReview]
    claimant: str
    text: str
|
||||
|
||||
class Summary(BaseModel):
    """Aggregate counts for a fact-check run."""
    fact_checking_sites_queried: int
    total_sources: int
|
||||
|
||||
class TokenUsage(BaseModel):
    """OpenAI token accounting for one request."""
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
|
||||
|
||||
class VerificationResult(BaseModel):
    """AI-generated verdict for a claim: verdict, confidence, and support."""
    verdict: str
    confidence: str
    evidence: List[str]
    reasoning: str
|
||||
|
||||
class EnhancedFactCheckResponse(BaseModel):
    """Full response combining raw claim matches with the AI verdict."""

    # The original search/claim text.
    query: str
    # Matched claims with their published reviews.
    results: List[Claim]
    # Domains that contributed evidence.
    sources: List[str]
    summary: Summary
    # Raw token counts (same keys as TokenUsage).
    token_usage: Dict[str, int]
    total_claims_found: int
    verification_result: VerificationResult
|
||||
172
app/services/openai_client.py
Normal file
172
app/services/openai_client.py
Normal file
|
|
@ -0,0 +1,172 @@
|
|||
import asyncio
import json
import logging as logger
from typing import Any, Dict, List

import numpy as np
import openai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain_core.documents import Document
|
||||
|
||||
class OpenAIClient:
    """Async-friendly wrapper around the legacy (v0.x) OpenAI SDK."""

    def __init__(self, api_key: str):
        """
        Initialize OpenAI client with the provided API key.

        Note: the legacy SDK stores the key at module level, so this affects
        the whole process, not just this instance.
        """
        openai.api_key = api_key

    async def generate_text_response(self, system_prompt: str, user_prompt: str, max_tokens: int) -> dict:
        """
        Generate a chat completion and parse the model output as JSON.

        Returns a dict with the parsed ``response`` plus prompt/completion/
        total token counts. Raises Exception on API or JSON-parsing errors.
        """
        try:
            # Fix: the legacy ChatCompletion.create call is synchronous and
            # would block the event loop inside this coroutine; run it in a
            # worker thread instead.
            response = await asyncio.to_thread(
                openai.ChatCompletion.create,
                model="gpt-4",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                max_tokens=max_tokens
            )
            content = response['choices'][0]['message']['content']
            # Parse the JSON string into a dictionary
            parsed_content = json.loads(content)

            return {
                "response": parsed_content,  # a dictionary, not a raw string
                "prompt_tokens": response['usage']['prompt_tokens'],
                "completion_tokens": response['usage']['completion_tokens'],
                "total_tokens": response['usage']['total_tokens']
            }
        except json.JSONDecodeError as e:
            raise Exception(f"Failed to parse OpenAI response as JSON: {str(e)}")
        except Exception as e:
            raise Exception(f"OpenAI text generation error: {str(e)}")

    def get_embeddings(self, texts: List[str]) -> List[List[float]]:
        """
        Retrieve embeddings for a list of texts using OpenAI's embedding API.

        NOTE(review): this is a blocking call; coroutine callers should
        consider wrapping it in asyncio.to_thread as well.
        """
        try:
            response = openai.Embedding.create(
                input=texts,
                model="text-embedding-ada-002"
            )
            embeddings = [data['embedding'] for data in response['data']]
            return embeddings
        except Exception as e:
            raise Exception(f"OpenAI embedding error: {str(e)}")
|
||||
|
||||
class AIFactChecker:
    """Retrieve-then-verify fact checker: scrape a page, chunk and embed it,
    select the chunks most similar to the claim, then ask the LLM for a
    verdict grounded in those chunks.
    """

    def __init__(self, openai_client: OpenAIClient):
        """Initialize the fact checker with OpenAI client."""
        self.openai_client = openai_client
        # Splits scraped pages into overlapping ~1000-char chunks before
        # embedding; the 200-char overlap preserves context across cuts.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        )

    async def scrape_webpage(self, url: str) -> List[Document]:
        """Scrape webpage content using LangChain's AsyncHtmlLoader."""
        try:
            loader = AsyncHtmlLoader([url])
            docs = await loader.aload()

            # Strip HTML markup, then split the cleaned text into chunks.
            bs_transformer = BeautifulSoupTransformer()
            docs_transformed = bs_transformer.transform_documents(docs)
            docs_chunks = self.text_splitter.split_documents(docs_transformed)

            logger.info(f"Successfully scraped webpage | chunks={len(docs_chunks)}")
            return docs_chunks

        except Exception as e:
            logger.error(f"Error scraping webpage | url={url} | error={str(e)}")
            raise

    def find_relevant_chunks(
        self,
        query_embedding: List[float],
        doc_embeddings: List[List[float]],
        docs: List[Document]
    ) -> List[Document]:
        """Find most relevant document chunks using cosine similarity."""
        try:
            query_array = np.array(query_embedding)
            chunks_array = np.array(doc_embeddings)

            # Cosine similarity of every chunk embedding against the query.
            similarities = np.dot(chunks_array, query_array) / (
                np.linalg.norm(chunks_array, axis=1) * np.linalg.norm(query_array)
            )

            # Top five chunks, most similar first (fewer if len(docs) < 5).
            top_indices = np.argsort(similarities)[-5:][::-1]
            return [docs[i] for i in top_indices]

        except Exception as e:
            logger.error(f"Error finding relevant chunks | error={str(e)}")
            raise

    async def verify_fact(self, query: str, relevant_docs: List[Document]) -> Dict[str, Any]:
        """Verify fact using OpenAI's API with context from relevant documents."""
        try:
            context = "\n\n".join([doc.page_content for doc in relevant_docs])

            # The model is told to answer in strict JSON; the OpenAI client
            # parses that JSON before returning, so a malformed answer raises.
            system_prompt = """You are a professional fact-checking assistant. Analyze the provided context
and determine if the given statement is true, false, or if there isn't enough information.

Provide your response in the following JSON format:
{
"verdict": "True/False/Insufficient Information",
"confidence": "High/Medium/Low",
"evidence": "Direct quotes or evidence from the context",
"reasoning": "Your detailed analysis and reasoning",
"missing_info": "Any important missing information (if applicable)"
}"""

            user_prompt = f"""Context:
{context}

Statement to verify: "{query}"

Analyze the statement based on the provided context and return your response in the specified JSON format."""

            response = await self.openai_client.generate_text_response(
                system_prompt=system_prompt,
                user_prompt=user_prompt,
                max_tokens=800
            )

            # De-duplicate the source URLs recorded in chunk metadata.
            sources = list(set([doc.metadata.get('source', 'Unknown source') for doc in relevant_docs]))

            return {
                "verification_result": response["response"],  # parsed dict, not a raw string
                "sources": sources,
                "token_usage": {
                    "prompt_tokens": response["prompt_tokens"],
                    "completion_tokens": response["completion_tokens"],
                    "total_tokens": response["total_tokens"]
                }
            }

        except Exception as e:
            logger.error(f"Error verifying fact | error={str(e)}")
            raise

    async def check_fact(self, url: str, query: str) -> Dict[str, Any]:
        """Main method to check a fact against a webpage."""
        try:
            docs = await self.scrape_webpage(url)

            # NOTE(review): get_embeddings is a blocking call running on the
            # event loop; a candidate for asyncio.to_thread.
            doc_texts = [doc.page_content for doc in docs]
            doc_embeddings = self.openai_client.get_embeddings(doc_texts)
            query_embedding = self.openai_client.get_embeddings([query])

            relevant_docs = self.find_relevant_chunks(query_embedding[0], doc_embeddings, docs)
            verification_result = await self.verify_fact(query, relevant_docs)

            return verification_result

        except Exception as e:
            logger.error(f"Error checking fact | error={str(e)}")
            raise
|
||||
BIN
app/websites/__pycache__/fact_checker_website.cpython-312.pyc
Normal file
BIN
app/websites/__pycache__/fact_checker_website.cpython-312.pyc
Normal file
Binary file not shown.
190
app/websites/fact_checker_website.py
Normal file
190
app/websites/fact_checker_website.py
Normal file
|
|
@ -0,0 +1,190 @@
|
|||
import asyncio
from typing import Dict, List

import requests
from fastapi import HTTPException

from app.models.ai_fact_check_models import FactCheckSource, ErrorResponse, FactCheckRequest, SourceType
|
||||
|
||||
# Sources configuration with validation.
# Fix: the original list contained duplicate domains (truthorfiction.com,
# faktograf.hr, factcrescendo.com), which produced duplicate FactCheckSource
# entries and redundant API queries; each domain now appears exactly once,
# keeping the order of its first occurrence.
SOURCES = {
    "fact_checkers": [
        FactCheckSource(domain=domain, type=SourceType.FACT_CHECKER, priority=1)
        for domain in [
            "snopes.com",
            "politifact.com",
            "factcheck.org",
            "reuters.com/fact-check",
            "apnews.com/hub/ap-fact-check",
            "bbc.com/news/reality_check",
            "fullfact.org",
            "afp.com/fact-check",
            "truthorfiction.com",
            "leadstories.com",
            "checkyourfact.com",
            "washingtonpost.com/news/fact-checker",
            "factcheck.kz",
            "poynter.org/ifcn",
            "factcheckeu.info",
            "africacheck.org",
            "thequint.com/webqoof",
            "altnews.in",
            "facta.news",
            "factcheckni.org",
            "mythdetector.ge",
            "verificado.mx",
            "euvsdisinfo.eu",
            "factcheck.afp.com",
            "newtral.es",
            "maldita.es",
            "faktograf.hr",
            "demagog.org.pl",
            "factnameh.com",
            "faktiskt.se",
            "teyit.org",
            "factly.in",
            "boom.live",
            "stopfake.org",
            "factcheck.ge",
            "factcheck.kg",
            "factcheck.uz",
            "factcheck.tj",
            "factcheck.az",
            "factcheck.am",
            "factcheck.md",
            "verafiles.org",
            "rappler.com/fact-check",
            "vera.com.gt",
            "chequeado.com",
            "aosfatos.org",
            "lasillavacia.com/detector-mentiras",
            "colombiacheck.com",
            "ecuadorchequea.com",
            "elsurti.com/checado",
            "verificat.cat",
            "mafindo.or.id",
            "tempo.co/cek-fakta",
            "factcheck.mk",
            "raskrinkavanje.ba",
            "demagog.cz",
            "faktabaari.fi",
            "correctiv.org",
            "mimikama.at",
            "factcheck.vlaanderen",
            "factuel.afp.com",
            "nieuwscheckers.nl",
            "faktisk.no",
            "tjekdet.dk",
            "ellinikahoaxes.gr",
            "faktograf.id",
            "stopfake.kz",
            "pesacheck.org",
            "dubawa.org",
            "namibiafactcheck.org.na",
            "zimfact.org",
            "ghanafact.com",
            "factspace.africa",
            "factcrescendo.com",
            "vishvasnews.com",
            "factcheck.lk",
            "newschecker.in",
            "boomlive.in",
            "digiteye.in",
            "indiatoday.in/fact-check",
            "piyasa.com/fact-check",
            "taiwanese.facts.news",
            "taiwanfactcheck.com",
            "mygopen.com",
            "tfc-taiwan.org.tw",
            "cofacts.tw",
            "rumor.taipei",
            "fact.qq.com",
            "factcheck.afp.com/list",
            "acfta.org",
            "crosscheck.firstdraftnews.org",
            "healthfeedback.org",
            "climatefeedback.org",
            "sciencefeedback.co",
            "factcheck.aap.com.au",
            "emergent.info",
            "hoax-slayer.net",
            "factcheck.media",
            "mediawise.org",
            "thejournal.ie/factcheck",
            "journalistsresource.org",
            "metafact.io",
            "reporterslab.org/fact-checking"
        ]
    ],
    "news_sites": [
        FactCheckSource(domain=domain, type=SourceType.NEWS_SITE, priority=2)
        for domain in [
            "www.thedailystar.net",
            "www.thefinancialexpress.com.bd",
            "www.theindependentbd.com",
            "www.dhakatribune.com",
            "www.newagebd.net",
            "www.observerbd.com",
            "www.daily-sun.com",
            "www.tbsnews.net",
            "www.businesspostbd.com",
            "www.banglanews24.com/english",
            "www.bdnews24.com/english",
            "www.risingbd.com/english",
            "www.dailyindustry.news",
            "www.bangladeshpost.net",
            "www.daily-bangladesh.com/english"
        ]
    ]
}
|
||||
|
||||
async def fetch_fact_checks(
    api_key: str,
    base_url: str,
    query: str,
    site: FactCheckSource
) -> Dict:
    """
    Fetch fact checks from a specific site using the Google Fact Check API.

    Args:
        api_key: Google API key.
        base_url: Fact Check Tools API endpoint.
        query: Claim text to search for.
        site: Source whose domain filters the claim reviews.

    Returns:
        The parsed JSON response from the API.

    Raises:
        HTTPException: 503 on network/API failure, 500 on missing config.
    """
    try:
        if not api_key or not base_url:
            raise ValueError("API key or base URL not configured")

        params = {
            "key": api_key,
            "query": query,
            "languageCode": "en-US",
            "reviewPublisherSiteFilter": site.domain,
            "pageSize": 10
        }

        # Fix: requests is synchronous; run it in a worker thread so the
        # event loop is not blocked, and bound the call with a timeout so a
        # hung upstream cannot stall the request forever.
        response = await asyncio.to_thread(
            requests.get, base_url, params=params, timeout=10
        )
        response.raise_for_status()
        return response.json()
    except requests.RequestException as e:
        raise HTTPException(
            status_code=503,
            detail=ErrorResponse(
                detail=f"Error fetching from {site.domain}: {str(e)}",
                error_code="FACT_CHECK_SERVICE_ERROR",
                path="/check-facts"
            ).model_dump()  # pydantic-v2 replacement for deprecated .dict()
        )
    except ValueError as e:
        raise HTTPException(
            status_code=500,
            detail=ErrorResponse(
                detail=str(e),
                error_code="CONFIGURATION_ERROR",
                path="/check-facts"
            ).model_dump()
        )
|
||||
|
||||
def get_all_sources() -> List[FactCheckSource]:
    """Return the configured sources ordered by ascending priority.

    Only the dedicated fact-checking sites are queried for now; the general
    news sites in SOURCES["news_sites"] are deliberately excluded.
    """
    fact_checker_sources = SOURCES["fact_checkers"]
    return sorted(fact_checker_sources, key=lambda source: source.priority)
|
||||
4
main.py
4
main.py
|
|
@ -1,6 +1,8 @@
|
|||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from app.api.fact_check import fact_check_router
|
||||
from app.api.ai_fact_check import aifact_check_router
|
||||
from app.api.scrap_websites import scrap_websites_router
|
||||
from app.config import FRONTEND_URL
|
||||
|
||||
# Initialize FastAPI app
|
||||
|
|
@ -39,6 +41,8 @@ async def health_check():
|
|||
return {"status": "healthy"}
|
||||
|
||||
# Register the three API routers at the application root (no path prefix).
app.include_router(fact_check_router, prefix="")
app.include_router(aifact_check_router, prefix="")
app.include_router(scrap_websites_router, prefix="")
|
||||
|
||||
# Include routers (uncomment and modify as needed)
|
||||
# from routes import some_router
|
||||
|
|
|
|||
28
search_response_altnews_in.html
Normal file
28
search_response_altnews_in.html
Normal file
File diff suppressed because one or more lines are too long
28
search_response_bbc_com.html
Normal file
28
search_response_bbc_com.html
Normal file
File diff suppressed because one or more lines are too long
28
search_response_en_prothomalo_com.html
Normal file
28
search_response_en_prothomalo_com.html
Normal file
File diff suppressed because one or more lines are too long
Loading…
Add table
Reference in a new issue