Dev #1

Merged
utshodey merged 5 commits from dev into master 2024-12-17 11:33:44 +00:00
7 changed files with 390 additions and 311 deletions
Showing only changes of commit 790d58402a

View file

@@ -2,7 +2,7 @@ from fastapi import APIRouter, HTTPException
 import json
 from datetime import datetime
 from typing import Dict, List
+import httpx
 from app.config import GOOGLE_API_KEY, GOOGLE_FACT_CHECK_BASE_URL
 from app.models.fact_check_models import (
     GoogleFactCheckRequest as FactCheckRequest,
@@ -12,7 +12,6 @@ from app.models.fact_check_models import (
     TokenUsage
 )
 from app.websites.fact_checker_website import fetch_fact_checks, get_all_sources
-from app.api.scrap_websites import SearchRequest, search_websites
 
 fact_check_router = APIRouter()
@@ -22,6 +21,39 @@ class CustomJSONEncoder(json.JSONEncoder):
             return obj.isoformat()
         return super().default(obj)
 
+async def validate_api_key():
+    """Validate the Google API key with a test request"""
+    async with httpx.AsyncClient() as client:
+        try:
+            test_url = f"{GOOGLE_FACT_CHECK_BASE_URL}claims:search"
+            params = {
+                "key": GOOGLE_API_KEY,
+                "query": "test",
+                "languageCode": "en-US",
+                "pageSize": 1
+            }
+            response = await client.get(test_url, params=params)
+            response.raise_for_status()
+            return True
+        except httpx.HTTPStatusError as e:
+            if e.response.status_code == 403:
+                raise HTTPException(
+                    status_code=503,
+                    detail=ErrorResponse(
+                        detail="Invalid or expired API key",
+                        error_code="INVALID_API_KEY",
+                        path="/check-facts"
+                    ).dict()
+                )
+            raise HTTPException(
+                status_code=503,
+                detail=ErrorResponse(
+                    detail=f"API validation failed: {str(e)}",
+                    error_code="API_VALIDATION_ERROR",
+                    path="/check-facts"
+                ).dict()
+            )
+
 @fact_check_router.post(
     "/check-facts",
     response_model=FactCheckResponse,
@@ -34,7 +66,7 @@ class CustomJSONEncoder(json.JSONEncoder):
 )
 async def check_facts(request: FactCheckRequest) -> FactCheckResponse:
     """
-    Check facts using multiple fact-checking sources and fallback to web search
+    Check facts using multiple fact-checking sources
     """
     all_results = []
     verified_results = []
@@ -50,10 +82,14 @@ async def check_facts(request: FactCheckRequest) -> FactCheckResponse:
             ).dict()
         )
 
+    # Validate API key before proceeding
+    await validate_api_key()
+
     # Get all sources in priority order
     all_sources = get_all_sources()
     all_sources_list = []  # To store source URLs
     contexts_used = []  # To store context snippets
+    failed_sources = []  # Track failed sources
 
     for source in all_sources:
         try:
@@ -78,75 +114,39 @@ async def check_facts(request: FactCheckRequest) -> FactCheckResponse:
                     if "textualRating" in review:
                         contexts_used.append(review["textualRating"])
 
-        except HTTPException:
+        except HTTPException as http_err:
+            failed_sources.append({
+                "source": source.domain,
+                "error": str(http_err.detail)
+            })
             continue
         except Exception as e:
-            # Log the error but continue with other sources
-            print(f"Error processing {source.domain}: {str(e)}")
+            failed_sources.append({
+                "source": source.domain,
+                "error": str(e)
+            })
             continue
 
-    # If no results found, try searching websites
-    if not all_results:
-        try:
-            # Create search request
-            search_request = SearchRequest(
-                search_text=request.content,
-                source_types=["fact_checkers"]
-            )
-            # Perform website search
-            search_response = await search_websites(search_request)
-            # If AI fact check results are available, use them
-            if search_response.ai_fact_check_result:
-                # Create a claim from AI fact check result
-                ai_claim = {
-                    "text": request.content,
-                    "claimant": "AI Analysis",
-                    "claimDate": datetime.now().isoformat(),
-                    "claimReview": [{
-                        "publisher": {
-                            "name": "AI Fact Checker",
-                            "site": "ai-fact-check"
-                        },
-                        "textualRating": search_response.ai_fact_check_result.verification_result["verdict"],
-                        "title": "AI Fact Check Analysis",
-                        "reviewDate": datetime.now().isoformat(),
-                        "url": ""
-                    }]
-                }
-                validated_claim = Claim(**ai_claim).dict()
-                all_results.append(validated_claim)
-                # Add sources and contexts
-                all_sources_list.extend(search_response.results.keys())
-                if search_response.ai_fact_check_result.verification_result["evidence"]:
-                    contexts_used.extend(search_response.ai_fact_check_result.verification_result["evidence"])
-        except Exception as e:
-            print(f"Error during website search: {str(e)}")
-
-    # If still no results found after searching websites
-    if not all_results:
+    # Return partial results if some sources failed but we have data
+    if all_results:
+        verification_result = {
+            "verdict": "Partial Results Available" if failed_sources else "Complete Results",
+            "confidence": "Medium" if failed_sources else "High",
+            "evidence": contexts_used,
+            "reasoning": "Based on available fact checks",
+            "missing_info": f"{len(failed_sources)} sources failed" if failed_sources else None
+        }
+    else:
         raise HTTPException(
             status_code=404,
             detail=ErrorResponse(
-                detail="No fact check results found",
+                detail="No fact check results found. Failed sources: " +
+                       ", ".join([f"{f['source']}: {f['error']}" for f in failed_sources]),
                 error_code="NO_RESULTS_FOUND",
                 path="/check-facts"
             ).dict()
         )
 
-    # Prepare the verification result
-    verification_result = {
-        "verdict": "Insufficient Information",  # Default verdict
-        "confidence": "Low",
-        "evidence": contexts_used,
-        "reasoning": "Based on available fact checks and web search results",
-        "missing_info": "Additional verification may be needed"
-    }
-
     # Create token usage information
     token_usage = TokenUsage(
         prompt_tokens=0,
@@ -161,10 +161,12 @@ async def check_facts(request: FactCheckRequest) -> FactCheckResponse:
         results=all_results,
         verification_result=verification_result,
         sources=list(set(all_sources_list)),
-        context_used=contexts_used,
         token_usage=token_usage,
         summary={
             "total_sources": len(set(all_sources_list)),
-            "fact_checking_sites_queried": len(all_sources)
+            "fact_checking_sites_queried": len(all_sources),
+            "failed_sources": failed_sources
        }
    )
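For reference, a minimal client sketch against the updated route (assumptions: the service runs locally on port 8000, the claim text is only an example, and FactCheckRequest still exposes the content field referenced elsewhere in this router):

# Hypothetical usage sketch, not part of the PR.
import asyncio
import httpx

async def demo_check_facts():
    async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
        resp = await client.post("/check-facts", json={"content": "Example claim to verify"})
        resp.raise_for_status()
        body = resp.json()
        # summary now reports failed_sources alongside the totals added in this commit
        print(body["verification_result"]["verdict"])
        print(body["summary"]["failed_sources"])

asyncio.run(demo_check_facts())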

View file

@@ -1,309 +1,342 @@
 from fastapi import APIRouter, HTTPException
-from pydantic import BaseModel
-from typing import List, Dict, Optional
-from urllib.parse import urlencode, urlparse
-import urllib.parse
-import numpy as np
-from time import sleep
+import httpx
 import logging
-import requests
-from bs4 import BeautifulSoup
-import re
+from urllib.parse import urlparse
+import json
 from app.services.openai_client import OpenAIClient
-from app.config import OPENAI_API_KEY
+from app.config import OPENAI_API_KEY, GOOGLE_API_KEY, GOOGLE_ENGINE_ID
 from app.websites.fact_checker_website import SOURCES, get_all_sources
 from app.api.ai_fact_check import ai_fact_check
+from typing import List, Dict, Optional
+from pydantic import BaseModel
 from app.models.fact_check_models import (
     AIFactCheckRequest,
-    AIFactCheckResponse,
-    VerificationResult,
-    TokenUsage
+    FactCheckSource,
+    SourceType
 )
 
+# Define Pydantic models
+class Publisher(BaseModel):
+    name: str
+    site: str
+
+class ClaimReview(BaseModel):
+    publisher: Publisher
+    textualRating: str
+
+class Claim(BaseModel):
+    claimReview: List[ClaimReview]
+    claimant: str
+    text: str
+
+class Summary(BaseModel):
+    fact_checking_sites_queried: int
+    total_sources: int
+
+class VerificationResult(BaseModel):
+    verdict: str
+    confidence: str
+    evidence: List[str]
+    reasoning: str
+    fact_check_type: str
+
+class SearchRequest(BaseModel):
+    search_text: str
+    source_types: List[str]
+
+class EnhancedFactCheckResponse(BaseModel):
+    query: str
+    results: List[Dict]
+    sources: List
+    summary: Summary
+    token_usage: Dict[str, int]
+    total_claims_found: int
+    verification_result: VerificationResult
+
 # Configure logging
 logging.basicConfig(
-    level=logging.INFO,
+    level=logging.INFO,  # Changed back to INFO from DEBUG
     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 )
 logger = logging.getLogger(__name__)
 
 scrap_websites_router = APIRouter()
 
-# Configuration for scraping
-MAX_RETRIES = 2
-RETRY_DELAY = 2
+# Constants
+RESULTS_PER_PAGE = 10
+MAX_PAGES = 5
+MAX_URLS_PER_DOMAIN = 5
+GOOGLE_SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
 
-class SearchRequest(BaseModel):
-    search_text: str
-    source_types: List[str] = ["fact_checkers"]
-
-class UrlSimilarityInfo(BaseModel):
-    url: str
-    similarity: float
-    extracted_text: str
-
-class SearchResponse(BaseModel):
-    results: Dict[str, List[str]]
-    error_messages: Dict[str, str]
-    ai_fact_check_result: Optional[Dict] = None
-
-def extract_url_text(url: str) -> str:
-    """Extract and process meaningful text from URL path with improved cleaning"""
-    logger.debug(f"Extracting text from URL: {url}")
+def get_domain_from_url(url: str) -> str:
+    """Extract domain from URL with improved handling."""
     try:
-        parsed = urllib.parse.urlparse(url)
-        path = parsed.path
-        path = path.replace('.html', '').replace('/index', '').replace('.php', '')
-        segments = [seg for seg in path.split('/') if seg]
-        cleaned_segments = []
-        for segment in segments:
-            segment = segment.replace('-', ' ').replace('_', ' ')
-            if not (segment.replace(' ', '').isdigit() or
-                    all(part.isdigit() for part in segment.split() if part)):
-                cleaned_segments.append(segment)
-        common_words = {
-            'www', 'live', 'news', 'intl', 'index', 'world', 'us', 'uk',
-            'updates', 'update', 'latest', 'breaking', 'new', 'article'
-        }
-        text = ' '.join(cleaned_segments)
-        words = [word.lower() for word in text.split()
-                 if word.lower() not in common_words and len(word) > 1]
-        result = ' '.join(words)
-        logger.debug(f"Extracted text: {result}")
-        return result
+        parsed = urlparse(url)
+        domain = parsed.netloc.lower()
+        # Remove 'www.' if present
+        if domain.startswith('www.'):
+            domain = domain[4:]
+        return domain
     except Exception as e:
-        logger.error(f"Error extracting text from URL {url}: {str(e)}")
-        return ''
+        logger.error(f"Error extracting domain from URL {url}: {str(e)}")
+        return ""
 
-def extract_search_results(html_content):
-    """Extract URLs using multiple selectors and patterns"""
-    soup = BeautifulSoup(html_content, 'html.parser')
-    urls = set()  # Using set to avoid duplicates
-    # Multiple CSS selectors to try
-    selectors = [
-        'div.g div.yuRUbf > a',  # Main result links
-        'div.g a.l',  # Alternative link format
-        'div.rc a',  # Another possible format
-        'div[class*="g"] > div > div > div > a',  # Broader match
-        'a[href^="http"]'  # Any http link
-    ]
-    for selector in selectors:
-        try:
-            elements = soup.select(selector)
-            for element in elements:
-                url = element.get('href')
-                if url and url.startswith('http') and not url.startswith('https://www.google.com'):
-                    urls.add(url)
-        except Exception as e:
-            logger.debug(f"Error with selector {selector}: {str(e)}")
-    # Also try finding URLs in the raw HTML using regex
-    url_pattern = r'href="(https?://[^"]+)"'
-    raw_urls = re.findall(url_pattern, html_content)
-    for url in raw_urls:
-        if not url.startswith('https://www.google.com'):
-            urls.add(url)
-    return list(urls)
-
-def google_search_scraper(search_text: str, site_domain: str, retry_count: int = 0) -> List[str]:
-    """Scrape Google search results with multiple query formats"""
-    logger.info(f"Searching for '{search_text}' on domain: {site_domain}")
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.5',
-        'Referer': 'https://www.google.com/',
-        'DNT': '1'
+def is_valid_source_domain(domain: str, sources: List[FactCheckSource]) -> bool:
+    """Check if domain matches any source with improved matching logic."""
+    if not domain:
+        return False
+    domain = domain.lower()
+    if domain.startswith('www.'):
+        domain = domain[4:]
+    for source in sources:
+        source_domain = source.domain.lower()
+        if source_domain.startswith('www.'):
+            source_domain = source_domain[4:]
+        # Check exact match
+        if domain == source_domain:
+            logger.debug(f"Exact domain match found: {domain} = {source_domain}")
+            return True
+        # Check if domain ends with source domain
+        if domain.endswith('.' + source_domain):
+            logger.debug(f"Subdomain match found: {domain} ends with {source_domain}")
+            return True
+    logger.debug(f"No match found for domain: {domain}")
+    return False
+
+async def build_enhanced_search_query(query: str, sources: List[FactCheckSource]) -> str:
+    """Build search query with site restrictions."""
+    site_queries = [f"site:{source.domain}" for source in sources]
+    site_restriction = " OR ".join(site_queries)
+    enhanced_query = f"({query}) ({site_restriction})"
+    logger.debug(f"Enhanced search query: {enhanced_query}")
+    return enhanced_query
+
+async def google_custom_search(query: str, sources: List[FactCheckSource], page: int = 1) -> Optional[Dict]:
+    """Perform Google Custom Search with enhanced query."""
+    enhanced_query = await build_enhanced_search_query(query, sources)
+    start_index = ((page - 1) * RESULTS_PER_PAGE) + 1
+    params = {
+        "key": GOOGLE_API_KEY,
+        "cx": GOOGLE_ENGINE_ID,
+        "q": enhanced_query,
+        "num": RESULTS_PER_PAGE,
+        "start": start_index
     }
-    # Try different query formats
-    query_formats = [
-        f"{search_text} site:{site_domain}",
-        f"site:{site_domain} {search_text}",
-        f"\"{search_text}\" site:{site_domain}"
-    ]
-    all_urls = set()
-    for query in query_formats:
+    async with httpx.AsyncClient(timeout=30.0) as client:
         try:
-            google_url = f"https://www.google.com/search?q={urlencode({'q': query})}"
-            logger.debug(f"Trying query format: {query}")
-            response = requests.get(google_url, headers=headers)
-            if response.status_code == 200:
-                urls = extract_search_results(response.text)
-                domain_urls = [url for url in urls if site_domain in urlparse(url).netloc]
-                all_urls.update(domain_urls)
-            else:
-                logger.warning(f"Received status code {response.status_code} for query format: {query}")
-            sleep(2)  # Delay between requests
+            logger.info(f"Making API request to Google Custom Search with params: {params}")
+            response = await client.get(GOOGLE_SEARCH_URL, params=params)
+            response.raise_for_status()
+            data = response.json()
+            search_info = data.get('searchInformation', {})
+            logger.info(f"Search info: Total results: {search_info.get('totalResults', 0)}, "
+                        f"Time taken: {search_info.get('searchTime', 0)}s")
+            if 'error' in data:
+                error_details = data['error']
+                logger.error(f"API Error: {error_details}")
+                raise HTTPException(
+                    status_code=response.status_code,
+                    detail=f"Google API Error: {error_details.get('message')}"
+                )
+            return data
         except Exception as e:
-            logger.error(f"Error processing query format '{query}': {str(e)}")
-            if retry_count < MAX_RETRIES:
-                sleep(RETRY_DELAY)
-                return google_search_scraper(search_text, site_domain, retry_count + 1)
-    valid_urls = list(all_urls)
-    logger.info(f"Found {len(valid_urls)} unique URLs for domain: {site_domain}")
-    return valid_urls[:5]  # Return up to 5 URLs
-
-def calculate_similarity(query_embedding: List[float], url_embedding: List[float]) -> float:
-    """Calculate cosine similarity between two embeddings"""
-    query_array = np.array(query_embedding)
-    url_array = np.array(url_embedding)
-    similarity = np.dot(url_array, query_array) / (
-        np.linalg.norm(url_array) * np.linalg.norm(query_array)
-    )
-    return float(similarity)
-
-@scrap_websites_router.post("/search", response_model=SearchResponse)
+            logger.error(f"Search error: {str(e)}", exc_info=True)
+            raise HTTPException(status_code=500, detail=f"Search error: {str(e)}")
+
+async def analyze_fact_check_results(openai_client: OpenAIClient, original_response: Dict) -> Dict:
+    """Analyze fact check results using OpenAI to generate a consolidated verdict."""
+    # Extract verification results from sources
+    verification_results = []
+    for url, result in original_response.get('verification_result', {}).items():
+        verification_results.append(f"""
+        Source: {url}
+        Verdict: {result.get('verdict')}
+        Confidence: {result.get('confidence')}
+        Evidence: {result.get('evidence')}
+        Reasoning: {result.get('reasoning')}
+        """)
+
+    system_prompt = """You are a professional fact-checking analyzer. Your task is to analyze multiple fact-checking results
+    and provide a consolidated verdict. Respond with a valid JSON object containing your analysis."""
+
+    user_prompt = f"""
+    Analyze these fact-checking results and provide a final verdict.
+    Query: {original_response.get('query', '')}
+    Fact Check Results:
+    {'\n'.join(verification_results)}"""
+
+    try:
+        logger.info("Generating AI analysis of fact check results")
+        response = await openai_client.generate_text_response(
+            system_prompt=system_prompt,
+            user_prompt=user_prompt,
+            max_tokens=2000
+        )
+        # Create the enhanced result structure
+        enhanced_result = {
+            "query": original_response.get('query', ''),
+            "results": [
+                {
+                    "claimReview": [
+                        {
+                            "publisher": {
+                                "name": source,
+                                "site": source
+                            },
+                            "textualRating": result.get('verdict', '')
+                        } for source in original_response.get('sources', [])
+                    ],
+                    "claimant": "source",
+                    "text": original_response.get('query', '')
+                }
+            ],
+            "sources": original_response.get('sources', []),
+            "summary": {
+                "fact_checking_sites_queried": len(original_response.get('sources', [])),
+                "total_sources": len(original_response.get('verification_result', {}))
+            },
+            "verification_result": {
+                "verdict": next(iter(original_response.get('verification_result', {}).values()), {}).get('verdict', ''),
+                "confidence": next(iter(original_response.get('verification_result', {}).values()), {}).get('confidence', ''),
+                "evidence": [next(iter(original_response.get('verification_result', {}).values()), {}).get('evidence', '')],
+                "reasoning": next(iter(original_response.get('verification_result', {}).values()), {}).get('reasoning', ''),
+                "fact_check_type": "ai fact checker"
+            },
+            "token_usage": original_response.get('token_usage', {
+                "prompt_tokens": 0,
+                "completion_tokens": 0,
+                "total_tokens": 0
+            })
+        }
+        enhanced_result["total_claims_found"] = len(enhanced_result.get("results", []))
+        logger.info("Successfully generated AI analysis")
+        return enhanced_result
+    except Exception as e:
+        logger.error(f"Error in OpenAI analysis: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Error in fact check analysis: {str(e)}")
+
+@scrap_websites_router.post("/search", response_model=EnhancedFactCheckResponse)
 async def search_websites(request: SearchRequest):
     logger.info(f"Starting search with query: {request.search_text}")
     logger.info(f"Source types requested: {request.source_types}")
-    results = {}
-    error_messages = {}
-
-    # Initialize OpenAI client
-    logger.debug("Initializing OpenAI client")
-    openai_client = OpenAIClient(OPENAI_API_KEY)
-
-    # Get domains based on requested source types
-    domains = []
+
+    # Get sources for requested types
+    selected_sources = []
     for source_type in request.source_types:
         if source_type in SOURCES:
-            domains.extend([source.domain for source in SOURCES[source_type]])
-    if not domains:
-        logger.warning("No valid source types provided. Using all available domains.")
-        domains = [source.domain for source in get_all_sources()]
-    logger.info(f"Processing {len(domains)} domains")
-
-    # Enhance search text with key terms
-    search_context = request.search_text
-    logger.debug("Getting query embedding from OpenAI")
-    query_embedding = openai_client.get_embeddings([search_context])[0]
-
-    # Higher similarity threshold for better filtering
-    SIMILARITY_THRESHOLD = 0.75
-    MAX_URLS_PER_DOMAIN = 2  # Adjusted to ensure total stays under 5
-    TOTAL_MAX_URLS = 5  # Maximum URLs allowed for AIFactCheckRequest
-
-    total_urls_collected = 0
-    for domain in domains[:3]:  # Limit to 3 domains for testing
-        if total_urls_collected >= TOTAL_MAX_URLS:
-            break
-        logger.info(f"Processing domain: {domain}")
-        try:
-            urls = google_search_scraper(request.search_text, domain)
-            valid_urls = []
-            logger.debug(f"Found {len(urls)} URLs for domain {domain}")
-            for url in urls:
-                if len(valid_urls) >= MAX_URLS_PER_DOMAIN or total_urls_collected >= TOTAL_MAX_URLS:
-                    break
-                url_text = extract_url_text(url)
-                if not url_text:
-                    logger.debug(f"No meaningful text extracted from URL: {url}")
+            selected_sources.extend(SOURCES[source_type])
+    if not selected_sources:
+        logger.warning("No valid source types provided. Using all available sources.")
+        selected_sources = get_all_sources()
+    logger.info(f"Selected sources: {[source.domain for source in selected_sources]}")
+
+    # Initialize collections for URLs
+    all_urls = []
+    domain_results = {}
+
+    try:
+        # Search and collect URLs
+        for page in range(1, MAX_PAGES + 1):
+            if len(all_urls) >= 50:
+                logger.info("Reached maximum URL limit of 50")
+                break
+            logger.info(f"Fetching page {page} of search results")
+            search_response = await google_custom_search(request.search_text, selected_sources, page)
+            if not search_response or not search_response.get("items"):
+                logger.warning(f"No results found on page {page}")
+                break
+            for item in search_response.get("items", []):
+                url = item.get("link")
+                if not url:
                     continue
-                logger.debug("Getting URL embedding from OpenAI")
-                url_embedding = openai_client.get_embeddings([url_text])[0]
-                similarity = calculate_similarity(query_embedding, url_embedding)
-                logger.debug(f"Similarity score for {url}: {similarity}")
-                if similarity >= SIMILARITY_THRESHOLD:
-                    valid_urls.append(url)
-                    total_urls_collected += 1
-            results[domain] = valid_urls
-            logger.info(f"Successfully processed domain {domain}. Found {len(valid_urls)} valid URLs")
-        except HTTPException as e:
-            logger.error(f"HTTP Exception for domain {domain}: {str(e.detail)}")
-            error_messages[domain] = str(e.detail)
-        except Exception as e:
-            logger.error(f"Unexpected error for domain {domain}: {str(e)}")
-            error_messages[domain] = f"Unexpected error for {domain}: {str(e)}"
-        sleep(1)  # Add delay between processing different domains
-
-    logger.info("Search completed")
-    logger.debug(f"Results found for {len(results)} domains")
-    logger.debug(f"Errors encountered for {len(error_messages)} domains")
-
-    # Collect all valid URLs from results
-    all_valid_urls = []
-    for domain_urls in results.values():
-        all_valid_urls.extend(domain_urls)
-
-    logger.info(f"Total valid URLs collected: {len(all_valid_urls)}")
-
-    # Create request body for AI fact check
-    if all_valid_urls:
-        fact_check_request = AIFactCheckRequest(
-            content=request.search_text,
-            urls=all_valid_urls[:TOTAL_MAX_URLS]  # Ensure we don't exceed the limit
-        )
-        logger.info("Calling AI fact check service")
-        try:
-            ai_response = await ai_fact_check(fact_check_request)
-            logger.info("AI fact check completed successfully")
-            # Format AI fact check response
-            formatted_response = {
-                "query": ai_response.query,
-                "token_usage": {
-                    "prompt_tokens": ai_response.token_usage.prompt_tokens,
-                    "completion_tokens": ai_response.token_usage.completion_tokens,
-                    "total_tokens": ai_response.token_usage.total_tokens
+                domain = get_domain_from_url(url)
+                logger.debug(f"Processing URL: {url} with domain: {domain}")
+                if is_valid_source_domain(domain, selected_sources):
+                    if domain not in domain_results:
+                        domain_results[domain] = []
+                    if len(domain_results[domain]) < MAX_URLS_PER_DOMAIN:
+                        domain_results[domain].append({
+                            "url": url,
+                            "title": item.get("title", ""),
+                            "snippet": item.get("snippet", "")
+                        })
+                        all_urls.append(url)
+                else:
+                    logger.debug(f"Skipping URL {url} - domain not in allowed list")
+            if len(all_urls) >= 50:
+                break
+
+        logger.info(f"Total URLs collected: {len(all_urls)}")
+
+        if not all_urls:
+            return EnhancedFactCheckResponse(
+                query=request.search_text,
+                results=[],
+                sources=[],
+                summary=Summary(
+                    fact_checking_sites_queried=len(selected_sources),
+                    total_sources=0
+                ),
+                token_usage={
+                    "prompt_tokens": 0,
+                    "completion_tokens": 0,
+                    "total_tokens": 0
                 },
-                "sources": ai_response.sources,
-                "verification_result": {
-                    url: {
-                        "verdict": result.verdict,
-                        "confidence": result.confidence,
-                        "evidence": result.evidence,
-                        "reasoning": result.reasoning,
-                        "missing_info": result.missing_info
-                    } for url, result in ai_response.verification_result.items()
-                }
-            }
-            # Return response with AI fact check results
-            return SearchResponse(
-                results=results,
-                error_messages=error_messages,
-                ai_fact_check_result=formatted_response
+                total_claims_found=0,
+                verification_result=VerificationResult(
+                    verdict="Insufficient Evidence",
+                    confidence="Low",
+                    evidence=["No relevant sources found"],
+                    reasoning="No fact-checking sources were found for this claim",
+                    fact_check_type="ai fact checker"
+                )
             )
-        except Exception as e:
-            logger.error(f"Error during AI fact check: {str(e)}")
-            error_messages["ai_fact_check"] = f"Error during fact checking: {str(e)}"
-
-    # Return response without AI fact check if no valid URLs or error occurred
-    return SearchResponse(
-        results=results,
-        error_messages=error_messages,
-        ai_fact_check_result=None
-    )
+
+        # Perform fact check with collected URLs
+        fact_check_request = AIFactCheckRequest(
+            content=request.search_text,
+            urls=all_urls[:5]  # Limit to 5 URLs
+        )
+        logger.info(f"Performing fact check with {len(fact_check_request.urls)} URLs")
+        fact_check_response = await ai_fact_check(fact_check_request)
+
+        # Get enhanced analysis
+        openai_client = OpenAIClient(OPENAI_API_KEY)
+        enhanced_response = await analyze_fact_check_results(
+            openai_client,
+            fact_check_response.dict()
+        )
+        return EnhancedFactCheckResponse(**enhanced_response)
+    except Exception as e:
+        logger.error(f"Error during search/fact-check process: {str(e)}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
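A standalone sketch of the query-building and pagination scheme used by build_enhanced_search_query and google_custom_search above (the domains below are illustrative; in the route they come from SOURCES / get_all_sources()):

# Illustrative sketch, not part of the PR.
RESULTS_PER_PAGE = 10

def build_query(query, domains):
    # Mirrors the site-restriction format: "(<query>) (site:a.com OR site:b.com)"
    site_restriction = " OR ".join(f"site:{d}" for d in domains)
    return f"({query}) ({site_restriction})"

def start_index(page):
    # Google Custom Search uses a 1-based result offset for 'start'
    return ((page - 1) * RESULTS_PER_PAGE) + 1

print(build_query("claim to verify", ["snopes.com", "politifact.com"]))
print([start_index(page) for page in range(1, 6)])  # [1, 11, 21, 31, 41]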

View file

@@ -5,6 +5,7 @@ load_dotenv()
 GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]
 GOOGLE_FACT_CHECK_BASE_URL= os.environ["GOOGLE_FACT_CHECK_BASE_URL"]
+GOOGLE_ENGINE_ID = os.environ["GOOGLE_ENGINE_ID"]
 OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
 FRONTEND_URL = os.environ["FRONTEND_URL"]
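Since the config now reads one more required variable, a small startup check can make a missing value fail fast (the variable names are taken from this file; the check itself is only a sketch, not part of the PR):

# Hypothetical sanity-check script.
import os

REQUIRED = [
    "GOOGLE_API_KEY",
    "GOOGLE_FACT_CHECK_BASE_URL",
    "GOOGLE_ENGINE_ID",   # new in this commit, used by the Custom Search route
    "OPENAI_API_KEY",
    "FRONTEND_URL",
]

missing = [name for name in REQUIRED if not os.environ.get(name)]
if missing:
    raise SystemExit(f"Missing environment variables: {', '.join(missing)}")
print("All required environment variables are set.")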

View file

@@ -0,0 +1,43 @@
+from pydantic import BaseModel
+from typing import List, Dict
+
+class SearchRequest(BaseModel):
+    search_text: str
+    source_types: List[str] = ["fact_checkers"]
+
+class Publisher(BaseModel):
+    name: str
+    site: str
+
+class ClaimReview(BaseModel):
+    publisher: Publisher
+    textualRating: str
+
+class Claim(BaseModel):
+    claimReview: List[ClaimReview]
+    claimant: str
+    text: str
+
+class Summary(BaseModel):
+    fact_checking_sites_queried: int
+    total_sources: int
+
+class TokenUsage(BaseModel):
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+
+class VerificationResult(BaseModel):
+    verdict: str
+    confidence: str
+    evidence: List[str]
+    reasoning: str
+
+class EnhancedFactCheckResponse(BaseModel):
+    query: str
+    results: List[Claim]
+    sources: List[str]
+    summary: Summary
+    token_usage: Dict[str, int]
+    total_claims_found: int
+    verification_result: VerificationResult
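A minimal construction example for these new response models (all values are illustrative; the .dict()/.json() usage elsewhere in the PR implies Pydantic v1, and the import of these classes is assumed since the module path is not shown in this diff):

# Assumes the classes defined above are importable from the new models module.
example = EnhancedFactCheckResponse(
    query="Example claim",
    results=[
        Claim(
            claimReview=[
                ClaimReview(
                    publisher=Publisher(name="snopes.com", site="snopes.com"),
                    textualRating="False",
                )
            ],
            claimant="source",
            text="Example claim",
        )
    ],
    sources=["snopes.com"],
    summary=Summary(fact_checking_sites_queried=1, total_sources=1),
    token_usage={"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
    total_claims_found=1,
    verification_result=VerificationResult(
        verdict="False",
        confidence="High",
        evidence=["Rated False by the publisher"],
        reasoning="Single source with a direct rating",
    ),
)
print(example.json(indent=2))  # Pydantic v1-style serialization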