from fastapi import APIRouter, HTTPException
import httpx
import logging
from urllib.parse import urlparse
import json
from app.services.openai_client import OpenAIClient
from app.config import OPENAI_API_KEY, GOOGLE_API_KEY, GOOGLE_ENGINE_ID
from app.websites.fact_checker_website import SOURCES, get_all_sources
from app.api.ai_fact_check import ai_fact_check
from typing import List, Dict, Optional
from pydantic import BaseModel
from app.models.fact_check_models import (
    AIFactCheckRequest,
    FactCheckSource,
    SourceType
)


# Define Pydantic models
class Publisher(BaseModel):
    name: str
    site: str


class ClaimReview(BaseModel):
    publisher: Publisher
    textualRating: str


class Claim(BaseModel):
    claimReview: List[ClaimReview]
    claimant: str
    text: str


class Summary(BaseModel):
    fact_checking_sites_queried: int
    total_sources: int


class VerificationResult(BaseModel):
    verdict: str
    confidence: str
    evidence: List[str]
    reasoning: str
    fact_check_type: str


class SearchRequest(BaseModel):
    search_text: str
    source_types: List[str]


class EnhancedFactCheckResponse(BaseModel):
    query: str
    results: List[Dict]
    sources: List
    summary: Summary
    token_usage: Dict[str, int]
    total_claims_found: int
    verification_result: VerificationResult


# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

scrap_websites_router = APIRouter()

# Constants
RESULTS_PER_PAGE = 10
MAX_PAGES = 5
MAX_URLS_PER_DOMAIN = 5
GOOGLE_SEARCH_URL = "https://www.googleapis.com/customsearch/v1"


def get_domain_from_url(url: str) -> str:
    """Extract domain from URL with improved handling."""
    try:
        parsed = urlparse(url)
        domain = parsed.netloc.lower()
        # Remove 'www.' if present
        if domain.startswith('www.'):
            domain = domain[4:]
        return domain
    except Exception as e:
        logger.error(f"Error extracting domain from URL {url}: {str(e)}")
        return ""


def is_valid_source_domain(domain: str, sources: List[FactCheckSource]) -> bool:
    """Check if domain matches any source with improved matching logic."""
    if not domain:
        return False

    domain = domain.lower()
    if domain.startswith('www.'):
        domain = domain[4:]

    for source in sources:
        source_domain = source.domain.lower()
        if source_domain.startswith('www.'):
            source_domain = source_domain[4:]

        # Check exact match
        if domain == source_domain:
            logger.debug(f"Exact domain match found: {domain} = {source_domain}")
            return True

        # Check if domain is a subdomain of the source domain
        if domain.endswith('.' + source_domain):
            logger.debug(f"Subdomain match found: {domain} ends with {source_domain}")
            return True

    logger.debug(f"No match found for domain: {domain}")
    return False


async def build_enhanced_search_query(query: str, sources: List[FactCheckSource]) -> str:
    """Build search query with site restrictions."""
    site_queries = [f"site:{source.domain}" for source in sources]
    site_restriction = " OR ".join(site_queries)
    enhanced_query = f"({query}) ({site_restriction})"
    logger.debug(f"Enhanced search query: {enhanced_query}")
    return enhanced_query


async def google_custom_search(query: str, sources: List[FactCheckSource], page: int = 1) -> Optional[Dict]:
    """Perform Google Custom Search with enhanced query."""
    enhanced_query = await build_enhanced_search_query(query, sources)
    start_index = ((page - 1) * RESULTS_PER_PAGE) + 1

    params = {
        "key": GOOGLE_API_KEY,
        "cx": GOOGLE_ENGINE_ID,
        "q": enhanced_query,
        "num": RESULTS_PER_PAGE,
        "start": start_index
    }

    async with httpx.AsyncClient(timeout=30.0) as client:
        try:
            logger.info(f"Making API request to Google Custom Search with params: {params}")
            response = await client.get(GOOGLE_SEARCH_URL, params=params)
            response.raise_for_status()

            data = response.json()
            search_info = data.get('searchInformation', {})
            logger.info(f"Search info: Total results: {search_info.get('totalResults', 0)}, "
                        f"Time taken: {search_info.get('searchTime', 0)}s")

            if 'error' in data:
                error_details = data['error']
                logger.error(f"API Error: {error_details}")
                raise HTTPException(
                    status_code=response.status_code,
                    detail=f"Google API Error: {error_details.get('message')}"
                )

            return data
        except Exception as e:
            logger.error(f"Search error: {str(e)}", exc_info=True)
            raise HTTPException(status_code=500, detail=f"Search error: {str(e)}")


async def analyze_fact_check_results(openai_client: OpenAIClient, original_response: Dict) -> Dict:
    """Analyze fact check results using OpenAI to generate a consolidated verdict."""
    # Extract per-URL verification results from the original response
    verification_results = []
    for url, result in original_response.get('verification_result', {}).items():
        verification_results.append(f"""
Source: {url}
Verdict: {result.get('verdict')}
Confidence: {result.get('confidence')}
Evidence: {result.get('evidence')}
Reasoning: {result.get('reasoning')}
""")

    # Join outside the f-string so the prompt builds on Python versions that
    # disallow backslashes inside f-string expressions (pre-3.12).
    results_text = "\n".join(verification_results)

    system_prompt = """You are a professional fact-checking analyzer. Your task is to analyze multiple fact-checking results
and provide a consolidated verdict. Respond with a valid JSON object containing your analysis."""

    user_prompt = f"""
    Analyze these fact-checking results and provide a final verdict.

    Query: {original_response.get('query', '')}

    Fact Check Results:
    {results_text}"""

    try:
        logger.info("Generating AI analysis of fact check results")
        response = await openai_client.generate_text_response(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            max_tokens=2000
        )

        # Create the enhanced result structure.
        # NOTE: the raw AI text response above is not merged into the structured
        # result; the consolidated verdict is taken from the first entry in
        # verification_result.
        verification_map = original_response.get('verification_result', {})
        first_verification = next(iter(verification_map.values()), {})

        enhanced_result = {
            "query": original_response.get('query', ''),
            "results": [
                {
                    "claimReview": [
                        {
                            "publisher": {
                                "name": source,
                                "site": source
                            },
                            "textualRating": first_verification.get('verdict', '')
                        }
                        for source in original_response.get('sources', [])
                    ],
                    "claimant": "source",
                    "text": original_response.get('query', '')
                }
            ],
            "sources": original_response.get('sources', []),
            "summary": {
                "fact_checking_sites_queried": len(original_response.get('sources', [])),
                "total_sources": len(verification_map)
            },
            "verification_result": {
                "verdict": first_verification.get('verdict', ''),
                "confidence": first_verification.get('confidence', ''),
                "evidence": [first_verification.get('evidence', '')],
                "reasoning": first_verification.get('reasoning', ''),
                "fact_check_type": "ai fact checker"
            },
            "token_usage": original_response.get('token_usage', {
                "prompt_tokens": 0,
                "completion_tokens": 0,
                "total_tokens": 0
            })
        }

        enhanced_result["total_claims_found"] = len(enhanced_result.get("results", []))

        logger.info("Successfully generated AI analysis")
        return enhanced_result
    except Exception as e:
        logger.error(f"Error in OpenAI analysis: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Error in fact check analysis: {str(e)}")


@scrap_websites_router.post("/search", response_model=EnhancedFactCheckResponse)
async def search_websites(request: SearchRequest):
    logger.info(f"Starting search with query: {request.search_text}")
    logger.info(f"Source types requested: {request.source_types}")

    # Get sources for requested types
    selected_sources = []
    for source_type in request.source_types:
        if source_type in SOURCES:
            selected_sources.extend(SOURCES[source_type])

    if not selected_sources:
        logger.warning("No valid source types provided. Using all available sources.")
        selected_sources = get_all_sources()

    logger.info(f"Selected sources: {[source.domain for source in selected_sources]}")

    # Initialize collections for URLs
    all_urls = []
    domain_results = {}

    try:
        # Search and collect URLs
        for page in range(1, MAX_PAGES + 1):
            if len(all_urls) >= 50:
                logger.info("Reached maximum URL limit of 50")
                break

            logger.info(f"Fetching page {page} of search results")
            search_response = await google_custom_search(request.search_text, selected_sources, page)

            if not search_response or not search_response.get("items"):
                logger.warning(f"No results found on page {page}")
                break

            for item in search_response.get("items", []):
                url = item.get("link")
                if not url:
                    continue

                domain = get_domain_from_url(url)
                logger.debug(f"Processing URL: {url} with domain: {domain}")

                if is_valid_source_domain(domain, selected_sources):
                    if domain not in domain_results:
                        domain_results[domain] = []
                    if len(domain_results[domain]) < MAX_URLS_PER_DOMAIN:
                        domain_results[domain].append({
                            "url": url,
                            "title": item.get("title", ""),
                            "snippet": item.get("snippet", "")
                        })
                        all_urls.append(url)
                else:
                    logger.debug(f"Skipping URL {url} - domain not in allowed list")

                if len(all_urls) >= 50:
                    break

        logger.info(f"Total URLs collected: {len(all_urls)}")

        if not all_urls:
            return EnhancedFactCheckResponse(
                query=request.search_text,
                results=[],
                sources=[],
                summary=Summary(
                    fact_checking_sites_queried=len(selected_sources),
                    total_sources=0
                ),
                token_usage={
                    "prompt_tokens": 0,
                    "completion_tokens": 0,
                    "total_tokens": 0
                },
                total_claims_found=0,
                verification_result=VerificationResult(
                    verdict="Insufficient Evidence",
                    confidence="Low",
                    evidence=["No relevant sources found"],
                    reasoning="No fact-checking sources were found for this claim",
                    fact_check_type="ai fact checker"
                )
            )

        # Perform fact check with collected URLs
        fact_check_request = AIFactCheckRequest(
            content=request.search_text,
            urls=all_urls[:5]  # Limit to 5 URLs
        )

        logger.info(f"Performing fact check with {len(fact_check_request.urls)} URLs")
        fact_check_response = await ai_fact_check(fact_check_request)

        # Get enhanced analysis
        openai_client = OpenAIClient(OPENAI_API_KEY)
        enhanced_response = await analyze_fact_check_results(
            openai_client,
            fact_check_response.dict()
        )

        return EnhancedFactCheckResponse(**enhanced_response)

    except Exception as e:
        logger.error(f"Error during search/fact-check process: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
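
# Usage sketch (assumptions, not part of this module: the router is mounted on
# the app without a prefix, the server runs at localhost:8000, and
# "fact_checkers" is one of the keys defined in
# app.websites.fact_checker_website.SOURCES):
#
#   import httpx
#
#   payload = {
#       "search_text": "Claim text to verify",
#       "source_types": ["fact_checkers"],
#   }
#   resp = httpx.post("http://localhost:8000/search", json=payload, timeout=120.0)
#   data = resp.json()
#   print(data["verification_result"]["verdict"], data["verification_result"]["confidence"])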