fact-checker-backend/app/api/scrap_websites2.py
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from typing import List, Dict, Optional
import requests
from bs4 import BeautifulSoup
import urllib.parse
import numpy as np
from time import sleep
import logging
from app.services.openai_client import OpenAIClient
from app.config import OPENAI_API_KEY
from app.websites.fact_checker_website import SOURCES, get_all_sources
from app.api.ai_fact_check import ai_fact_check
from app.models.fact_check_models import AIFactCheckRequest, AIFactCheckResponse

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

scrap_websites_router = APIRouter()

# Configuration for rate limiting
RATE_LIMIT_DELAY = 2  # Delay between requests in seconds
MAX_RETRIES = 1  # Maximum number of request attempts per domain (retries beyond this raise)
RETRY_DELAY = 1  # Delay between retries in seconds
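
# With these values a single domain costs roughly 3 seconds in the worst case
# (excluding the request itself): sleep(RATE_LIMIT_DELAY) before the first
# request, then sleep(RETRY_DELAY) once before the retry, which immediately
# raises because retry_count has reached MAX_RETRIES.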


class SearchRequest(BaseModel):
    search_text: str
    source_types: List[str] = ["fact_checkers"]


class UrlSimilarityInfo(BaseModel):
    url: str
    similarity: float
    extracted_text: str


class SearchResponse(BaseModel):
    results: Dict[str, List[str]]
    error_messages: Dict[str, str]
    ai_fact_check_result: Optional[AIFactCheckResponse] = None
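
# Example request body for POST /search (illustrative values only; the valid
# source_types depend on the SOURCES mapping in app.websites.fact_checker_website):
#
#   {
#       "search_text": "drinking bleach cures covid",
#       "source_types": ["fact_checkers"]
#   }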


def extract_url_text(url: str) -> str:
    """Extract and process meaningful text from URL path with improved cleaning"""
    logger.debug(f"Extracting text from URL: {url}")
    try:
        parsed = urllib.parse.urlparse(url)
        path = parsed.path
        path = path.replace('.html', '').replace('/index', '').replace('.php', '')
        segments = [seg for seg in path.split('/') if seg]

        cleaned_segments = []
        for segment in segments:
            segment = segment.replace('-', ' ').replace('_', ' ')
            if not (segment.replace(' ', '').isdigit() or
                    all(part.isdigit() for part in segment.split() if part)):
                cleaned_segments.append(segment)

        common_words = {
            'www', 'live', 'news', 'intl', 'index', 'world', 'us', 'uk',
            'updates', 'update', 'latest', 'breaking', 'new', 'article'
        }
        text = ' '.join(cleaned_segments)
        words = [word.lower() for word in text.split()
                 if word.lower() not in common_words and len(word) > 1]

        result = ' '.join(words)
        logger.debug(f"Extracted text: {result}")
        return result
    except Exception as e:
        logger.error(f"Error extracting text from URL {url}: {str(e)}")
        return ''
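
# Illustrative example for extract_url_text (hypothetical URL):
#   extract_url_text("https://www.snopes.com/fact-check/moon-landing-staged/")
#   -> "fact check moon landing staged"
# Purely numeric segments (dates, article IDs) and words in common_words are dropped.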


def google_search_scraper(search_text: str, site_domain: str, retry_count: int = 0) -> List[str]:
    """Scrape Google search results with retry logic and rate limiting"""
    logger.info(f"Searching for '{search_text}' on domain: {site_domain} (Attempt {retry_count + 1}/{MAX_RETRIES})")

    if retry_count >= MAX_RETRIES:
        logger.error(f"Max retries exceeded for domain: {site_domain}")
        raise HTTPException(
            status_code=429,
            detail=f"Max retries exceeded for {site_domain}"
        )

    # Restrict results to the target domain. The site: operator must stay outside
    # quotation marks, otherwise Google treats it as a literal phrase.
    query = f"{search_text} site:{site_domain}"
    encoded_query = urllib.parse.quote(query)
    base_url = "https://www.google.com/search"
    url = f"{base_url}?q={encoded_query}"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        logger.debug(f"Waiting {RATE_LIMIT_DELAY} seconds before request")
        sleep(RATE_LIMIT_DELAY)

        logger.debug(f"Making request to Google Search for domain: {site_domain}")
        response = requests.get(url, headers=headers)

        if response.status_code == 429 or "sorry/index" in response.url:
            logger.warning(f"Rate limit hit for domain {site_domain}. Retrying after delay...")
            sleep(RETRY_DELAY)
            return google_search_scraper(search_text, site_domain, retry_count + 1)

        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        search_results = soup.find_all('div', class_='g')

        urls = []
        for result in search_results[:3]:
            link = result.find('a')
            if link and 'href' in link.attrs:
                url = link['href']
                if url.startswith('http'):
                    urls.append(url)

        logger.info(f"Found {len(urls)} results for domain: {site_domain}")
        return urls[:5]
    except requests.RequestException as e:
        if retry_count < MAX_RETRIES:
            logger.warning(f"Request failed for {site_domain}. Retrying... Error: {str(e)}")
            sleep(RETRY_DELAY)
            return google_search_scraper(search_text, site_domain, retry_count + 1)
        logger.error(f"All retries failed for domain {site_domain}. Error: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"Error scraping {site_domain}: {str(e)}"
        )
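
# Illustrative usage of google_search_scraper (hypothetical domain). It returns at
# most three result URLs scraped from the first page of Google results, e.g.
#   google_search_scraper("moon landing staged", "snopes.com")
#   -> ["https://www.snopes.com/fact-check/moon-landing-staged/", ...]
# Google changes its result markup regularly, so the 'div.g' selector may need
# adjustment over time.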


def calculate_similarity(query_embedding: List[float], url_embedding: List[float]) -> float:
    """Calculate cosine similarity between two embeddings"""
    query_array = np.array(query_embedding)
    url_array = np.array(url_embedding)

    similarity = np.dot(url_array, query_array) / (
        np.linalg.norm(url_array) * np.linalg.norm(query_array)
    )
    return float(similarity)
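
# Cosine similarity ranges from -1 to 1: vectors pointing the same way score 1.0,
# orthogonal vectors score 0.0, e.g.
#   calculate_similarity([1.0, 0.0], [1.0, 0.0])  # -> 1.0
#   calculate_similarity([1.0, 0.0], [0.0, 1.0])  # -> 0.0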


@scrap_websites_router.post("/search", response_model=SearchResponse)
async def search_websites(request: SearchRequest):
    logger.info(f"Starting search with query: {request.search_text}")
    logger.info(f"Source types requested: {request.source_types}")

    results = {}
    error_messages = {}
    url_similarities = {}

    # Initialize OpenAI client
    logger.debug("Initializing OpenAI client")
    openai_client = OpenAIClient(OPENAI_API_KEY)

    # Get domains based on requested source types
    domains = []
    for source_type in request.source_types:
        if source_type in SOURCES:
            domains.extend([source.domain for source in SOURCES[source_type]])

    if not domains:
        logger.warning("No valid source types provided. Using all available domains.")
        domains = [source.domain for source in get_all_sources()]

    logger.info(f"Processing {len(domains)} domains")

    # Embed the search text once and reuse it for every URL comparison
    search_context = request.search_text
    logger.debug("Getting query embedding from OpenAI")
    query_embedding = openai_client.get_embeddings([search_context])[0]

    # Higher similarity threshold for better filtering
    SIMILARITY_THRESHOLD = 0.75

    for domain in domains:
        logger.info(f"Processing domain: {domain}")
        try:
            urls = google_search_scraper(request.search_text, domain)
            url_sims = []
            valid_urls = []

            logger.debug(f"Found {len(urls)} URLs for domain {domain}")
            for url in urls:
                url_text = extract_url_text(url)
                if not url_text:
                    logger.debug(f"No meaningful text extracted from URL: {url}")
                    continue

                logger.debug("Getting URL embedding from OpenAI")
                url_embedding = openai_client.get_embeddings([url_text])[0]
                similarity = calculate_similarity(query_embedding, url_embedding)
                logger.debug(f"Similarity score for {url}: {similarity}")

                url_sims.append(UrlSimilarityInfo(
                    url=url,
                    similarity=similarity,
                    extracted_text=url_text
                ))

                if similarity >= SIMILARITY_THRESHOLD:
                    valid_urls.append(url)

            results[domain] = valid_urls
            url_similarities[domain] = sorted(url_sims,
                                              key=lambda x: x.similarity,
                                              reverse=True)
            logger.info(f"Successfully processed domain {domain}. Found {len(valid_urls)} valid URLs")
        except HTTPException as e:
            logger.error(f"HTTP Exception for domain {domain}: {str(e.detail)}")
            error_messages[domain] = str(e.detail)
        except Exception as e:
            logger.error(f"Unexpected error for domain {domain}: {str(e)}")
            error_messages[domain] = f"Unexpected error for {domain}: {str(e)}"

    logger.info("Search completed")
    logger.debug(f"Results found for {len(results)} domains")
    logger.debug(f"Errors encountered for {len(error_messages)} domains")

    # Collect all valid URLs from results
    all_valid_urls = []
    for domain_urls in results.values():
        all_valid_urls.extend(domain_urls)

    logger.info(f"Total valid URLs collected: {len(all_valid_urls)}")

    # Create request body for AI fact check
    if all_valid_urls:
        fact_check_request = AIFactCheckRequest(
            content=request.search_text,
            urls=all_valid_urls
        )
        logger.info("Calling AI fact check service")
        try:
            ai_response = await ai_fact_check(fact_check_request)
            logger.info("AI fact check completed successfully")

            # Return response with AI fact check results
            return SearchResponse(
                results=results,
                error_messages=error_messages,
                ai_fact_check_result=ai_response
            )
        except Exception as e:
            logger.error(f"Error during AI fact check: {str(e)}")
            error_messages["ai_fact_check"] = f"Error during fact checking: {str(e)}"

    # Return response without AI fact check if no valid URLs or an error occurred
    return SearchResponse(
        results=results,
        error_messages=error_messages,
        ai_fact_check_result=None
    )
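
# Illustrative call (assuming the router is mounted under an /api prefix in the
# FastAPI app; adjust the path to match the actual application configuration):
#
#   curl -X POST http://localhost:8000/api/search \
#        -H "Content-Type: application/json" \
#        -d '{"search_text": "drinking bleach cures covid", "source_types": ["fact_checkers"]}'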