fact-checker-backend/app/api/scrap_websites.py
from fastapi import APIRouter, HTTPException
import httpx
import logging
from urllib.parse import urlparse
import json
from app.services.openai_client import OpenAIClient
from app.config import OPENAI_API_KEY, GOOGLE_API_KEY, GOOGLE_ENGINE_ID
from app.websites.fact_checker_website import SOURCES, get_all_sources
from app.api.ai_fact_check import ai_fact_check
from typing import List, Dict, Optional
from pydantic import BaseModel
from app.models.fact_check_models import (
    AIFactCheckRequest,
    FactCheckSource,
    SourceType
)

# Define Pydantic models
class Publisher(BaseModel):
    name: str
    site: str


class ClaimReview(BaseModel):
    publisher: Publisher
    textualRating: str


class Claim(BaseModel):
    claimReview: List[ClaimReview]
    claimant: str
    text: str


class Summary(BaseModel):
    fact_checking_sites_queried: int
    total_sources: int


class VerificationResult(BaseModel):
    verdict: str
    confidence: str
    evidence: List[str]
    reasoning: str
    fact_check_type: str


class SearchRequest(BaseModel):
    search_text: str
    source_types: List[str]


class EnhancedFactCheckResponse(BaseModel):
    query: str
    results: List[Dict]
    sources: List
    summary: Summary
    token_usage: Dict[str, int]
    total_claims_found: int
    verification_result: VerificationResult
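
# EnhancedFactCheckResponse mirrors the dictionary assembled in
# analyze_fact_check_results() below, which lets the /search endpoint validate
# and return that structure directly via response_model.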

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

scrap_websites_router = APIRouter()

# Constants
RESULTS_PER_PAGE = 10
MAX_PAGES = 5
MAX_URLS_PER_DOMAIN = 5
GOOGLE_SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
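
# With RESULTS_PER_PAGE = 10 and MAX_PAGES = 5, at most 50 results are examined
# per request, matching the hard cap of 50 collected URLs in search_websites() below.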


def get_domain_from_url(url: str) -> str:
    """Extract domain from URL with improved handling."""
    try:
        parsed = urlparse(url)
        domain = parsed.netloc.lower()
        # Remove 'www.' if present
        if domain.startswith('www.'):
            domain = domain[4:]
        return domain
    except Exception as e:
        logger.error(f"Error extracting domain from URL {url}: {str(e)}")
        return ""


def is_valid_source_domain(domain: str, sources: List[FactCheckSource]) -> bool:
    """Check if domain matches any source with improved matching logic."""
    if not domain:
        return False

    domain = domain.lower()
    if domain.startswith('www.'):
        domain = domain[4:]

    for source in sources:
        source_domain = source.domain.lower()
        if source_domain.startswith('www.'):
            source_domain = source_domain[4:]

        # Check exact match
        if domain == source_domain:
            logger.debug(f"Exact domain match found: {domain} = {source_domain}")
            return True

        # Check if domain ends with source domain
        if domain.endswith('.' + source_domain):
            logger.debug(f"Subdomain match found: {domain} ends with {source_domain}")
            return True

    logger.debug(f"No match found for domain: {domain}")
    return False
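
# Illustrative examples, assuming "snopes.com" is one of the configured sources:
#   is_valid_source_domain("snopes.com", sources)       -> True  (exact match)
#   is_valid_source_domain("blog.snopes.com", sources)  -> True  (subdomain match)
#   is_valid_source_domain("example.com", sources)      -> False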


async def build_enhanced_search_query(query: str, sources: List[FactCheckSource]) -> str:
    """Build search query with site restrictions."""
    site_queries = [f"site:{source.domain}" for source in sources]
    site_restriction = " OR ".join(site_queries)
    enhanced_query = f"({query}) ({site_restriction})"
    logger.debug(f"Enhanced search query: {enhanced_query}")
    return enhanced_query
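
# For example, a query of "moon landing faked" with two configured sources
# (domains assumed for illustration) yields:
#   (moon landing faked) (site:snopes.com OR site:factcheck.org)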


async def google_custom_search(query: str, sources: List[FactCheckSource], page: int = 1) -> Optional[Dict]:
    """Perform Google Custom Search with enhanced query."""
    enhanced_query = await build_enhanced_search_query(query, sources)
    start_index = ((page - 1) * RESULTS_PER_PAGE) + 1

    params = {
        "key": GOOGLE_API_KEY,
        "cx": GOOGLE_ENGINE_ID,
        "q": enhanced_query,
        "num": RESULTS_PER_PAGE,
        "start": start_index
    }

    async with httpx.AsyncClient(timeout=30.0) as client:
        try:
            # Log the request without exposing the API credentials.
            logger.info(
                "Making Google Custom Search request: query=%r, page=%s, start=%s",
                enhanced_query, page, start_index
            )
            response = await client.get(GOOGLE_SEARCH_URL, params=params)
            response.raise_for_status()
            data = response.json()

            search_info = data.get('searchInformation', {})
            logger.info(f"Search info: Total results: {search_info.get('totalResults', 0)}, "
                        f"Time taken: {search_info.get('searchTime', 0)}s")

            if 'error' in data:
                error_details = data['error']
                logger.error(f"API Error: {error_details}")
                raise HTTPException(
                    status_code=response.status_code,
                    detail=f"Google API Error: {error_details.get('message')}"
                )

            return data
        except HTTPException:
            # Preserve the status code set above instead of collapsing it into a 500.
            raise
        except Exception as e:
            logger.error(f"Search error: {str(e)}", exc_info=True)
            raise HTTPException(status_code=500, detail=f"Search error: {str(e)}")
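
# Abridged sketch of the Custom Search JSON consumed downstream; only the fields
# used in this module are shown:
#   {
#     "searchInformation": {"totalResults": "123", "searchTime": 0.42},
#     "items": [
#       {"link": "https://...", "title": "...", "snippet": "..."}
#     ]
#   }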


async def analyze_fact_check_results(openai_client: OpenAIClient, original_response: Dict) -> Dict:
    """Analyze fact check results using OpenAI to generate a consolidated verdict."""
    # Extract verification results from sources
    verification_results = []
    for url, result in original_response.get('verification_result', {}).items():
        verification_results.append(f"""
        Source: {url}
        Verdict: {result.get('verdict')}
        Confidence: {result.get('confidence')}
        Evidence: {result.get('evidence')}
        Reasoning: {result.get('reasoning')}
        """)

    system_prompt = """You are a professional fact-checking analyzer. Your task is to analyze multiple fact-checking results
    and provide a consolidated verdict. Respond with a valid JSON object containing your analysis."""

    # Join outside the f-string so the code also runs on Python versions that do not
    # allow backslashes inside f-string expressions (pre-3.12).
    joined_results = "\n".join(verification_results)
    user_prompt = f"""
    Analyze these fact-checking results and provide a final verdict.

    Query: {original_response.get('query', '')}

    Fact Check Results:
    {joined_results}"""
    try:
        logger.info("Generating AI analysis of fact check results")
        response = await openai_client.generate_text_response(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            max_tokens=2000
        )

        # NOTE: the model response is requested above, but the consolidated structure
        # below is assembled from the original verification results. The first verified
        # source's verdict/confidence/evidence/reasoning are promoted to the top-level
        # verification_result.
        first_verification = next(iter(original_response.get('verification_result', {}).values()), {})

        enhanced_result = {
            "query": original_response.get('query', ''),
            "results": [
                {
                    "claimReview": [
                        {
                            "publisher": {
                                "name": source,
                                "site": source
                            },
                            "textualRating": first_verification.get('verdict', '')
                        } for source in original_response.get('sources', [])
                    ],
                    "claimant": "source",
                    "text": original_response.get('query', '')
                }
            ],
            "sources": original_response.get('sources', []),
            "summary": {
                "fact_checking_sites_queried": len(original_response.get('sources', [])),
                "total_sources": len(original_response.get('verification_result', {}))
            },
            "verification_result": {
                "verdict": first_verification.get('verdict', ''),
                "confidence": first_verification.get('confidence', ''),
                "evidence": [first_verification.get('evidence', '')],
                "reasoning": first_verification.get('reasoning', ''),
                "fact_check_type": "ai fact checker"
            },
            "token_usage": original_response.get('token_usage', {
                "prompt_tokens": 0,
                "completion_tokens": 0,
                "total_tokens": 0
            })
        }
        enhanced_result["total_claims_found"] = len(enhanced_result.get("results", []))

        logger.info("Successfully generated AI analysis")
        return enhanced_result
    except Exception as e:
        logger.error(f"Error in OpenAI analysis: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Error in fact check analysis: {str(e)}")
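
# Shape of original_response as consumed above (inferred from the field accesses;
# the authoritative schema is the ai_fact_check response model):
#   {
#     "query": "...",
#     "sources": ["snopes.com", ...],
#     "verification_result": {
#       "https://...": {"verdict": "...", "confidence": "...",
#                       "evidence": "...", "reasoning": "..."}
#     },
#     "token_usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
#   }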


@scrap_websites_router.post("/search", response_model=EnhancedFactCheckResponse)
async def search_websites(request: SearchRequest):
    logger.info(f"Starting search with query: {request.search_text}")
    logger.info(f"Source types requested: {request.source_types}")

    # Get sources for requested types
    selected_sources = []
    for source_type in request.source_types:
        if source_type in SOURCES:
            selected_sources.extend(SOURCES[source_type])

    if not selected_sources:
        logger.warning("No valid source types provided. Using all available sources.")
        selected_sources = get_all_sources()

    logger.info(f"Selected sources: {[source.domain for source in selected_sources]}")

    # Initialize collections for URLs
    all_urls = []
    domain_results = {}

    try:
        # Search and collect URLs
        for page in range(1, MAX_PAGES + 1):
            if len(all_urls) >= 50:
                logger.info("Reached maximum URL limit of 50")
                break

            logger.info(f"Fetching page {page} of search results")
            search_response = await google_custom_search(request.search_text, selected_sources, page)

            if not search_response or not search_response.get("items"):
                logger.warning(f"No results found on page {page}")
                break

            for item in search_response.get("items", []):
                url = item.get("link")
                if not url:
                    continue

                domain = get_domain_from_url(url)
                logger.debug(f"Processing URL: {url} with domain: {domain}")

                if is_valid_source_domain(domain, selected_sources):
                    if domain not in domain_results:
                        domain_results[domain] = []
                    if len(domain_results[domain]) < MAX_URLS_PER_DOMAIN:
                        domain_results[domain].append({
                            "url": url,
                            "title": item.get("title", ""),
                            "snippet": item.get("snippet", "")
                        })
                        all_urls.append(url)
                else:
                    logger.debug(f"Skipping URL {url} - domain not in allowed list")

                if len(all_urls) >= 50:
                    break

        logger.info(f"Total URLs collected: {len(all_urls)}")

        if not all_urls:
            return EnhancedFactCheckResponse(
                query=request.search_text,
                results=[],
                sources=[],
                summary=Summary(
                    fact_checking_sites_queried=len(selected_sources),
                    total_sources=0
                ),
                token_usage={
                    "prompt_tokens": 0,
                    "completion_tokens": 0,
                    "total_tokens": 0
                },
                total_claims_found=0,
                verification_result=VerificationResult(
                    verdict="Insufficient Evidence",
                    confidence="Low",
                    evidence=["No relevant sources found"],
                    reasoning="No fact-checking sources were found for this claim",
                    fact_check_type="ai fact checker"
                )
            )
        # Perform fact check with collected URLs
        fact_check_request = AIFactCheckRequest(
            content=request.search_text,
            urls=all_urls[:5]  # Limit to 5 URLs
        )

        logger.info(f"Performing fact check with {len(fact_check_request.urls)} URLs")
        fact_check_response = await ai_fact_check(fact_check_request)

        # Get enhanced analysis
        openai_client = OpenAIClient(OPENAI_API_KEY)
        enhanced_response = await analyze_fact_check_results(
            openai_client,
            fact_check_response.dict()
        )

        return EnhancedFactCheckResponse(**enhanced_response)

    except HTTPException:
        # Let upstream HTTP errors (e.g. from the search helper) keep their status codes.
        raise
    except Exception as e:
        logger.error(f"Error during search/fact-check process: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
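
# Illustrative request (sketch; the valid source_types keys are whatever SOURCES in
# app.websites.fact_checker_website defines, and "fact_checkers" is an assumed example):
#
#   POST /search
#   {
#     "search_text": "Claim text to verify",
#     "source_types": ["fact_checkers"]
#   }
#
# The response follows EnhancedFactCheckResponse: claimReview-style results, the
# queried sources, a summary, token_usage, and a consolidated verification_result.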