# fact-checker-backend/app/api/scrap_websites.py
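"""Search configured fact-checking websites for a claim and run AI fact-checking.

Exposes a single POST /search endpoint that queries the Google Custom Search API
(restricted to the configured source domains), collects matching URLs, and
forwards them to the ai_fact_check endpoint for verification.
"""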

from fastapi import APIRouter, HTTPException
import httpx
import logging
from urllib.parse import urlparse
from typing import List, Dict, Optional
from pydantic import BaseModel
from app.models.ai_fact_check_models import (
    AIFactCheckRequest,
    FactCheckSource,
    SourceType
)
from app.websites.fact_checker_website import SOURCES, get_all_sources
from app.api.ai_fact_check import ai_fact_check
from app.config import GOOGLE_API_KEY, GOOGLE_ENGINE_ID, GOOGLE_SEARCH_URL


class SearchRequest(BaseModel):
    search_text: str
    source_types: List[str] = ["fact_checkers"]


# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

scrap_websites_router = APIRouter()

# Constants
RESULTS_PER_PAGE = 10
MAX_PAGES = 5
MAX_URLS_PER_DOMAIN = 5


def get_domain_from_url(url: str) -> str:
    """Extract domain from URL with improved handling."""
    try:
        parsed = urlparse(url)
        domain = parsed.netloc.lower()
        if domain.startswith('www.'):
            domain = domain[4:]
        return domain
    except Exception as e:
        logger.error(f"Error extracting domain from URL {url}: {str(e)}")
        return ""
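
# Illustrative example (hypothetical URL):
#   get_domain_from_url("https://www.example.com/article") -> "example.com"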


def is_valid_source_domain(domain: str, sources: List[FactCheckSource]) -> bool:
    """Check if domain matches any source with improved matching logic."""
    if not domain:
        return False

    domain = domain.lower()
    if domain.startswith('www.'):
        domain = domain[4:]

    for source in sources:
        source_domain = source.domain.lower()
        if source_domain.startswith('www.'):
            source_domain = source_domain[4:]
        if domain == source_domain or domain.endswith('.' + source_domain):
            return True
    return False
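
# Note: subdomains also match, e.g. (hypothetically) "news.example.org" is
# accepted when "example.org" is one of the configured source domains.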


async def build_enhanced_search_query(query: str, sources: List[FactCheckSource]) -> str:
    """Build search query with site restrictions."""
    site_queries = [f"site:{source.domain}" for source in sources]
    site_restriction = " OR ".join(site_queries)
    return f"({query}) ({site_restriction})"
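
# The resulting query looks roughly like (actual domains come from SOURCES):
#   "(<search text>) (site:<domain-1> OR site:<domain-2> OR ...)"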


async def google_custom_search(query: str, sources: List[FactCheckSource], page: int = 1) -> Optional[Dict]:
    """Perform Google Custom Search with enhanced query."""
    enhanced_query = await build_enhanced_search_query(query, sources)
    start_index = ((page - 1) * RESULTS_PER_PAGE) + 1

    params = {
        "key": GOOGLE_API_KEY,
        "cx": GOOGLE_ENGINE_ID,
        "q": enhanced_query,
        "num": RESULTS_PER_PAGE,
        "start": start_index
    }

    async with httpx.AsyncClient(timeout=30.0) as client:
        try:
            response = await client.get(GOOGLE_SEARCH_URL, params=params)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            logger.error(f"Search error: {str(e)}")
            raise HTTPException(status_code=500, detail=f"Search error: {str(e)}")


@scrap_websites_router.post("/search")
async def search_websites(request: SearchRequest):
    """Search the selected source sites and fact-check the collected URLs."""
    # Get the source types from the request
    source_types = request.source_types if request.source_types else ["fact_checkers"]

    # Get sources based on requested types
    selected_sources = []
    for source_type in source_types:
        if source_type in SOURCES:
            selected_sources.extend(SOURCES[source_type])

    # If no valid sources found, use fact checkers as default
    if not selected_sources:
        selected_sources = SOURCES["fact_checkers"]

    all_urls = []
    domain_results = {}

    try:
        for page in range(1, MAX_PAGES + 1):
            if len(all_urls) >= 50:
                break

            search_response = await google_custom_search(request.search_text, selected_sources, page)
            if not search_response or not search_response.get("items"):
                break

            for item in search_response.get("items", []):
                url = item.get("link")
                if not url:
                    continue

                domain = get_domain_from_url(url)
                if is_valid_source_domain(domain, selected_sources):
                    if domain not in domain_results:
                        domain_results[domain] = []
                    # Cap the number of URLs collected per domain
                    if len(domain_results[domain]) < MAX_URLS_PER_DOMAIN:
                        domain_results[domain].append({
                            "url": url,
                            "title": item.get("title", ""),
                            "snippet": item.get("snippet", "")
                        })
                        all_urls.append(url)

                if len(all_urls) >= 50:
                    break

        if not all_urls:
            return {
                "status": "no_results",
                "urls_found": 0
            }

        # Fact-check using only the first few URLs to keep the request small
        fact_check_request = AIFactCheckRequest(
            content=request.search_text,
            urls=all_urls[:5]
        )
        return await ai_fact_check(fact_check_request)

    except HTTPException:
        # Let HTTP errors raised above (e.g. search failures) propagate unchanged
        raise
    except Exception as e:
        logger.error(f"Error during search/fact-check process: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
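

# Example request body for POST /search (illustrative values; the full URL path
# depends on where scrap_websites_router is mounted in the FastAPI app):
# {
#     "search_text": "some claim to verify",
#     "source_types": ["fact_checkers"]
# }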