fact-checker-backend/app/api/scrap_websites.py
from fastapi import APIRouter, HTTPException
import httpx
import logging
from urllib.parse import urlparse
import json
from app.services.openai_client import OpenAIClient
from app.config import OPENAI_API_KEY, GOOGLE_API_KEY, GOOGLE_ENGINE_ID
from app.websites.fact_checker_website import SOURCES, get_all_sources
from app.api.ai_fact_check import ai_fact_check
from typing import List, Dict, Optional
from pydantic import BaseModel
from app.models.fact_check_models import (
    AIFactCheckRequest,
    FactCheckSource,
    SourceType
)

# Define Pydantic models
class Publisher(BaseModel):
    name: str
    site: str


class ClaimReview(BaseModel):
    publisher: Publisher
    textualRating: str


class Claim(BaseModel):
    claimReview: List[ClaimReview]
    claimant: str
    text: str


class Summary(BaseModel):
    fact_checking_sites_queried: int
    total_sources: int


class VerificationResult(BaseModel):
    verdict: str
    confidence: str
    evidence: List[str]
    reasoning: str
    fact_check_type: str


class SearchRequest(BaseModel):
    search_text: str
    source_types: List[str]


class EnhancedFactCheckResponse(BaseModel):
    query: str
    results: List[Dict]
    sources: List
    summary: Summary
    token_usage: Dict[str, int]
    total_claims_found: int
    verification_result: VerificationResult
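
# EnhancedFactCheckResponse mirrors the dictionary assembled in
# analyze_fact_check_results() below, which lets the /search endpoint validate
# and return that structure directly via response_model.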

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

scrap_websites_router = APIRouter()

# Constants
RESULTS_PER_PAGE = 10
MAX_PAGES = 5
MAX_URLS_PER_DOMAIN = 5
GOOGLE_SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
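
# With RESULTS_PER_PAGE = 10 and MAX_PAGES = 5, at most 50 results are examined
# per request, matching the hard cap of 50 collected URLs in search_websites() below.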


def get_domain_from_url(url: str) -> str:
    """Extract domain from URL with improved handling."""
    try:
        parsed = urlparse(url)
        domain = parsed.netloc.lower()
        # Remove 'www.' if present
        if domain.startswith('www.'):
            domain = domain[4:]
        return domain
    except Exception as e:
        logger.error(f"Error extracting domain from URL {url}: {str(e)}")
        return ""


def is_valid_source_domain(domain: str, sources: List[FactCheckSource]) -> bool:
    """Check if domain matches any source with improved matching logic."""
    if not domain:
        return False

    domain = domain.lower()
    if domain.startswith('www.'):
        domain = domain[4:]

    for source in sources:
        source_domain = source.domain.lower()
        if source_domain.startswith('www.'):
            source_domain = source_domain[4:]

        # Check exact match
        if domain == source_domain:
            logger.debug(f"Exact domain match found: {domain} = {source_domain}")
            return True

        # Check if domain ends with source domain
        if domain.endswith('.' + source_domain):
            logger.debug(f"Subdomain match found: {domain} ends with {source_domain}")
            return True

    logger.debug(f"No match found for domain: {domain}")
    return False
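
# Illustrative examples, assuming "snopes.com" is one of the configured sources:
#   is_valid_source_domain("snopes.com", sources)       -> True  (exact match)
#   is_valid_source_domain("blog.snopes.com", sources)  -> True  (subdomain match)
#   is_valid_source_domain("example.com", sources)      -> False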


async def build_enhanced_search_query(query: str, sources: List[FactCheckSource]) -> str:
    """Build search query with site restrictions."""
    site_queries = [f"site:{source.domain}" for source in sources]
    site_restriction = " OR ".join(site_queries)
    enhanced_query = f"({query}) ({site_restriction})"
    logger.debug(f"Enhanced search query: {enhanced_query}")
    return enhanced_query
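
# For example, a query of "moon landing faked" with two configured sources
# (domains assumed for illustration) yields:
#   (moon landing faked) (site:snopes.com OR site:factcheck.org)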


async def google_custom_search(query: str, sources: List[FactCheckSource], page: int = 1) -> Optional[Dict]:
    """Perform Google Custom Search with enhanced query."""
    enhanced_query = await build_enhanced_search_query(query, sources)
    start_index = ((page - 1) * RESULTS_PER_PAGE) + 1

    params = {
        "key": GOOGLE_API_KEY,
        "cx": GOOGLE_ENGINE_ID,
        "q": enhanced_query,
        "num": RESULTS_PER_PAGE,
        "start": start_index
    }

    async with httpx.AsyncClient(timeout=30.0) as client:
        try:
            # Log the request without exposing the API credentials.
            logger.info(
                "Making Google Custom Search request: query=%r, page=%s, start=%s",
                enhanced_query, page, start_index
            )
            response = await client.get(GOOGLE_SEARCH_URL, params=params)
            response.raise_for_status()
            data = response.json()

            search_info = data.get('searchInformation', {})
            logger.info(f"Search info: Total results: {search_info.get('totalResults', 0)}, "
                        f"Time taken: {search_info.get('searchTime', 0)}s")

            if 'error' in data:
                error_details = data['error']
                logger.error(f"API Error: {error_details}")
                raise HTTPException(
                    status_code=response.status_code,
                    detail=f"Google API Error: {error_details.get('message')}"
                )

            return data
        except HTTPException:
            # Preserve the status code set above instead of collapsing it into a 500.
            raise
        except Exception as e:
            logger.error(f"Search error: {str(e)}", exc_info=True)
            raise HTTPException(status_code=500, detail=f"Search error: {str(e)}")
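
# Abridged sketch of the Custom Search JSON consumed downstream; only the fields
# used in this module are shown:
#   {
#     "searchInformation": {"totalResults": "123", "searchTime": 0.42},
#     "items": [
#       {"link": "https://...", "title": "...", "snippet": "..."}
#     ]
#   }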


async def analyze_fact_check_results(openai_client: OpenAIClient, original_response: Dict) -> Dict:
    """Analyze fact check results using OpenAI to generate a consolidated verdict."""
    # Extract verification results from sources
    verification_results = []
    for url, result in original_response.get('verification_result', {}).items():
        verification_results.append(f"""
        Source: {url}
        Verdict: {result.get('verdict')}
        Confidence: {result.get('confidence')}
        Evidence: {result.get('evidence')}
        Reasoning: {result.get('reasoning')}
        """)

    system_prompt = """You are a professional fact-checking analyzer. Your task is to analyze multiple fact-checking results
    and provide a consolidated verdict. Respond with a valid JSON object containing your analysis."""

    # Join outside the f-string so the code also runs on Python versions that do not
    # allow backslashes inside f-string expressions (pre-3.12).
    joined_results = "\n".join(verification_results)
    user_prompt = f"""
    Analyze these fact-checking results and provide a final verdict.

    Query: {original_response.get('query', '')}

    Fact Check Results:
    {joined_results}"""
    try:
        logger.info("Generating AI analysis of fact check results")
        response = await openai_client.generate_text_response(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            max_tokens=2000
        )

        # NOTE: the model response is requested above, but the consolidated structure
        # below is assembled from the original verification results. The first verified
        # source's verdict/confidence/evidence/reasoning are promoted to the top-level
        # verification_result.
        first_verification = next(iter(original_response.get('verification_result', {}).values()), {})

        enhanced_result = {
            "query": original_response.get('query', ''),
            "results": [
                {
                    "claimReview": [
                        {
                            "publisher": {
                                "name": source,
                                "site": source
                            },
                            "textualRating": first_verification.get('verdict', '')
                        } for source in original_response.get('sources', [])
                    ],
                    "claimant": "source",
                    "text": original_response.get('query', '')
                }
            ],
            "sources": original_response.get('sources', []),
            "summary": {
                "fact_checking_sites_queried": len(original_response.get('sources', [])),
                "total_sources": len(original_response.get('verification_result', {}))
            },
            "verification_result": {
                "verdict": first_verification.get('verdict', ''),
                "confidence": first_verification.get('confidence', ''),
                "evidence": [first_verification.get('evidence', '')],
                "reasoning": first_verification.get('reasoning', ''),
                "fact_check_type": "ai fact checker"
            },
            "token_usage": original_response.get('token_usage', {
                "prompt_tokens": 0,
                "completion_tokens": 0,
                "total_tokens": 0
            })
        }
        enhanced_result["total_claims_found"] = len(enhanced_result.get("results", []))

        logger.info("Successfully generated AI analysis")
        return enhanced_result
    except Exception as e:
        logger.error(f"Error in OpenAI analysis: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Error in fact check analysis: {str(e)}")
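
# Shape of original_response as consumed above (inferred from the field accesses;
# the authoritative schema is the ai_fact_check response model):
#   {
#     "query": "...",
#     "sources": ["snopes.com", ...],
#     "verification_result": {
#       "https://...": {"verdict": "...", "confidence": "...",
#                       "evidence": "...", "reasoning": "..."}
#     },
#     "token_usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
#   }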


@scrap_websites_router.post("/search", response_model=EnhancedFactCheckResponse)
async def search_websites(request: SearchRequest):
    logger.info(f"Starting search with query: {request.search_text}")
    logger.info(f"Source types requested: {request.source_types}")

    # Get sources for requested types
    selected_sources = []
    for source_type in request.source_types:
        if source_type in SOURCES:
            selected_sources.extend(SOURCES[source_type])

    if not selected_sources:
        logger.warning("No valid source types provided. Using all available sources.")
        selected_sources = get_all_sources()

    logger.info(f"Selected sources: {[source.domain for source in selected_sources]}")

    # Initialize collections for URLs
    all_urls = []
    domain_results = {}

    try:
        # Search and collect URLs
        for page in range(1, MAX_PAGES + 1):
            if len(all_urls) >= 50:
                logger.info("Reached maximum URL limit of 50")
                break

            logger.info(f"Fetching page {page} of search results")
            search_response = await google_custom_search(request.search_text, selected_sources, page)

            if not search_response or not search_response.get("items"):
                logger.warning(f"No results found on page {page}")
                break

            for item in search_response.get("items", []):
                url = item.get("link")
                if not url:
                    continue

                domain = get_domain_from_url(url)
                logger.debug(f"Processing URL: {url} with domain: {domain}")

                if is_valid_source_domain(domain, selected_sources):
                    if domain not in domain_results:
                        domain_results[domain] = []
                    if len(domain_results[domain]) < MAX_URLS_PER_DOMAIN:
                        domain_results[domain].append({
                            "url": url,
                            "title": item.get("title", ""),
                            "snippet": item.get("snippet", "")
                        })
                        all_urls.append(url)
                else:
                    logger.debug(f"Skipping URL {url} - domain not in allowed list")

                if len(all_urls) >= 50:
                    break

        logger.info(f"Total URLs collected: {len(all_urls)}")

        if not all_urls:
            return EnhancedFactCheckResponse(
                query=request.search_text,
                results=[],
                sources=[],
                summary=Summary(
                    fact_checking_sites_queried=len(selected_sources),
                    total_sources=0
                ),
                token_usage={
                    "prompt_tokens": 0,
                    "completion_tokens": 0,
                    "total_tokens": 0
                },
                total_claims_found=0,
                verification_result=VerificationResult(
                    verdict="Insufficient Evidence",
                    confidence="Low",
                    evidence=["No relevant sources found"],
                    reasoning="No fact-checking sources were found for this claim",
                    fact_check_type="ai fact checker"
                )
            )
        # Perform fact check with collected URLs
        fact_check_request = AIFactCheckRequest(
            content=request.search_text,
            urls=all_urls[:5]  # Limit to 5 URLs
        )

        logger.info(f"Performing fact check with {len(fact_check_request.urls)} URLs")
        fact_check_response = await ai_fact_check(fact_check_request)

        # Get enhanced analysis
        openai_client = OpenAIClient(OPENAI_API_KEY)
        enhanced_response = await analyze_fact_check_results(
            openai_client,
            fact_check_response.dict()
        )

        return EnhancedFactCheckResponse(**enhanced_response)

    except HTTPException:
        # Let upstream HTTP errors (e.g. from the search helper) keep their status codes.
        raise
    except Exception as e:
        logger.error(f"Error during search/fact-check process: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
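
# Illustrative request (sketch; the valid source_types keys are whatever SOURCES in
# app.websites.fact_checker_website defines, and "fact_checkers" is an assumed example):
#
#   POST /search
#   {
#     "search_text": "Claim text to verify",
#     "source_types": ["fact_checkers"]
#   }
#
# The response follows EnhancedFactCheckResponse: claimReview-style results, the
# queried sources, a summary, token_usage, and a consolidated verification_result.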