import json
import logging
from typing import Dict, List, Optional
from urllib.parse import urlparse

import httpx
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

from app.api.ai_fact_check import ai_fact_check
from app.config import GOOGLE_API_KEY, GOOGLE_ENGINE_ID, OPENAI_API_KEY
from app.models.fact_check_models import (
    AIFactCheckRequest,
    FactCheckSource,
    SourceType,
)
from app.services.openai_client import OpenAIClient
from app.websites.fact_checker_website import SOURCES, get_all_sources

# Define Pydantic models
class Publisher(BaseModel):
    name: str
    site: str


class ClaimReview(BaseModel):
    publisher: Publisher
    textualRating: str


class Claim(BaseModel):
    claimReview: List[ClaimReview]
    claimant: str
    text: str


class Summary(BaseModel):
    fact_checking_sites_queried: int
    total_sources: int


class VerificationResult(BaseModel):
    verdict: str
    confidence: str
    evidence: List[str]
    reasoning: str
    fact_check_type: str


class SearchRequest(BaseModel):
    search_text: str
    source_types: List[str]


class EnhancedFactCheckResponse(BaseModel):
    query: str
    results: List[Dict]
    sources: List
    summary: Summary
    token_usage: Dict[str, int]
    total_claims_found: int
    verification_result: VerificationResult
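
# Illustrative sketch (added for clarity, not an official API contract): roughly the
# JSON shape that EnhancedFactCheckResponse serializes to. Only the field names come
# from the models above; all values below are made-up examples.
#
# {
#   "query": "example claim text",
#   "results": [{"claimReview": [{"publisher": {"name": "example-site.com",
#                                                "site": "example-site.com"},
#                                 "textualRating": "False"}],
#                "claimant": "source", "text": "example claim text"}],
#   "sources": ["example-site.com"],
#   "summary": {"fact_checking_sites_queried": 3, "total_sources": 1},
#   "token_usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
#   "total_claims_found": 1,
#   "verification_result": {"verdict": "False", "confidence": "High",
#                           "evidence": ["..."], "reasoning": "...",
#                           "fact_check_type": "ai fact checker"}
# }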

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

scrap_websites_router = APIRouter()

# Constants
RESULTS_PER_PAGE = 10
MAX_PAGES = 5
MAX_URLS_PER_DOMAIN = 5
GOOGLE_SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
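
# Sanity note on the limits above (derived from how they are used below, not from an
# external spec): each Custom Search page returns RESULTS_PER_PAGE (10) items and at
# most MAX_PAGES (5) pages are fetched, so the search loop can collect at most 50
# candidate URLs; the endpoint additionally caps stored results per publisher at
# MAX_URLS_PER_DOMAIN (5). Page N maps to the API's `start` parameter as
#   start = (N - 1) * RESULTS_PER_PAGE + 1
# i.e. page 1 -> start=1, page 2 -> start=11, page 5 -> start=41.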

def get_domain_from_url(url: str) -> str:
    """Extract domain from URL with improved handling."""
    try:
        parsed = urlparse(url)
        domain = parsed.netloc.lower()
        # Remove 'www.' if present
        if domain.startswith('www.'):
            domain = domain[4:]
        return domain
    except Exception as e:
        logger.error(f"Error extracting domain from URL {url}: {str(e)}")
        return ""

def is_valid_source_domain(domain: str, sources: List[FactCheckSource]) -> bool:
    """Check if domain matches any source with improved matching logic."""
    if not domain:
        return False

    domain = domain.lower()
    if domain.startswith('www.'):
        domain = domain[4:]

    for source in sources:
        source_domain = source.domain.lower()
        if source_domain.startswith('www.'):
            source_domain = source_domain[4:]

        # Check exact match
        if domain == source_domain:
            logger.debug(f"Exact domain match found: {domain} = {source_domain}")
            return True

        # Check if domain ends with source domain (subdomain match)
        if domain.endswith('.' + source_domain):
            logger.debug(f"Subdomain match found: {domain} ends with {source_domain}")
            return True

    logger.debug(f"No match found for domain: {domain}")
    return False
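
# Behavioural sketch of the two helpers above. The URLs and domains are hypothetical
# examples, and FactCheckSource is only assumed to expose a `domain` attribute as it
# is used here:
#
#   get_domain_from_url("https://www.example-factcheck.org/claims/123")
#       -> "example-factcheck.org"
#   is_valid_source_domain("news.example-factcheck.org", sources)
#       -> True when some source.domain == "example-factcheck.org" (subdomain match)
#   is_valid_source_domain("unrelated-site.com", sources)
#       -> False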

async def build_enhanced_search_query(query: str, sources: List[FactCheckSource]) -> str:
    """Build search query with site restrictions."""
    site_queries = [f"site:{source.domain}" for source in sources]
    site_restriction = " OR ".join(site_queries)
    enhanced_query = f"({query}) ({site_restriction})"
    logger.debug(f"Enhanced search query: {enhanced_query}")
    return enhanced_query
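
# Example of the query shape produced above (domains are placeholders):
#   await build_enhanced_search_query("earth is flat", sources)
#       -> "(earth is flat) (site:factchecker-one.com OR site:factchecker-two.org)"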

async def google_custom_search(query: str, sources: List[FactCheckSource], page: int = 1) -> Optional[Dict]:
    """Perform Google Custom Search with enhanced query."""
    enhanced_query = await build_enhanced_search_query(query, sources)
    start_index = ((page - 1) * RESULTS_PER_PAGE) + 1

    params = {
        "key": GOOGLE_API_KEY,
        "cx": GOOGLE_ENGINE_ID,
        "q": enhanced_query,
        "num": RESULTS_PER_PAGE,
        "start": start_index
    }

    async with httpx.AsyncClient(timeout=30.0) as client:
        try:
            # Avoid logging the full params dict: it contains the API key.
            logger.info(f"Querying Google Custom Search (page {page}, start {start_index}): {enhanced_query}")
            response = await client.get(GOOGLE_SEARCH_URL, params=params)
            response.raise_for_status()

            data = response.json()

            search_info = data.get('searchInformation', {})
            logger.info(f"Search info: Total results: {search_info.get('totalResults', 0)}, "
                        f"Time taken: {search_info.get('searchTime', 0)}s")

            if 'error' in data:
                error_details = data['error']
                logger.error(f"API Error: {error_details}")
                raise HTTPException(
                    status_code=response.status_code,
                    detail=f"Google API Error: {error_details.get('message')}"
                )

            return data

        except HTTPException:
            # Propagate the HTTP error raised above instead of re-wrapping it as a 500.
            raise
        except Exception as e:
            logger.error(f"Search error: {str(e)}", exc_info=True)
            raise HTTPException(status_code=500, detail=f"Search error: {str(e)}")
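
# For reference, the parts of the Custom Search JSON response that this module reads
# (field names taken from the code above and from the /search handler below; the
# values shown are illustrative only):
#
# {
#   "searchInformation": {"totalResults": "123", "searchTime": 0.3},
#   "items": [
#     {"link": "https://example-factcheck.org/claims/123",
#      "title": "Example headline",
#      "snippet": "Example snippet text ..."}
#   ]
# }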

async def analyze_fact_check_results(openai_client: OpenAIClient, original_response: Dict) -> Dict:
    """Analyze fact check results using OpenAI to generate a consolidated verdict."""

    # Extract verification results from sources
    verification_results = []
    for url, result in original_response.get('verification_result', {}).items():
        verification_results.append(f"""
        Source: {url}
        Verdict: {result.get('verdict')}
        Confidence: {result.get('confidence')}
        Evidence: {result.get('evidence')}
        Reasoning: {result.get('reasoning')}
        """)

    system_prompt = """You are a professional fact-checking analyzer. Your task is to analyze multiple fact-checking results
    and provide a consolidated verdict. Respond with a valid JSON object containing your analysis."""

    # Join outside the f-string: a backslash inside an f-string expression is a
    # syntax error on Python versions before 3.12.
    joined_results = "\n".join(verification_results)
    user_prompt = f"""
    Analyze these fact-checking results and provide a final verdict.

    Query: {original_response.get('query', '')}

    Fact Check Results:
    {joined_results}"""

    try:
        logger.info("Generating AI analysis of fact check results")
        # NOTE: the model's consolidated analysis is requested here, but the
        # structured result below is currently built from the first per-URL
        # verification entry rather than from this response.
        await openai_client.generate_text_response(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            max_tokens=2000
        )

        # First per-URL verification entry, used as the consolidated verdict.
        first_result = next(iter(original_response.get('verification_result', {}).values()), {})

        # Create the enhanced result structure
        enhanced_result = {
            "query": original_response.get('query', ''),
            "results": [
                {
                    "claimReview": [
                        {
                            "publisher": {
                                "name": source,
                                "site": source
                            },
                            "textualRating": first_result.get('verdict', '')
                        } for source in original_response.get('sources', [])
                    ],
                    "claimant": "source",
                    "text": original_response.get('query', '')
                }
            ],
            "sources": original_response.get('sources', []),
            "summary": {
                "fact_checking_sites_queried": len(original_response.get('sources', [])),
                "total_sources": len(original_response.get('verification_result', {}))
            },
            "verification_result": {
                "verdict": first_result.get('verdict', ''),
                "confidence": first_result.get('confidence', ''),
                "evidence": [first_result.get('evidence', '')],
                "reasoning": first_result.get('reasoning', ''),
                "fact_check_type": "ai fact checker"
            },
            "token_usage": original_response.get('token_usage', {
                "prompt_tokens": 0,
                "completion_tokens": 0,
                "total_tokens": 0
            })
        }

        enhanced_result["total_claims_found"] = len(enhanced_result.get("results", []))

        logger.info("Successfully generated AI analysis")
        return enhanced_result

    except Exception as e:
        logger.error(f"Error in OpenAI analysis: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Error in fact check analysis: {str(e)}")

@scrap_websites_router.post("/search", response_model=EnhancedFactCheckResponse)
async def search_websites(request: SearchRequest):
    logger.info(f"Starting search with query: {request.search_text}")
    logger.info(f"Source types requested: {request.source_types}")

    # Get sources for requested types
    selected_sources = []
    for source_type in request.source_types:
        if source_type in SOURCES:
            selected_sources.extend(SOURCES[source_type])

    if not selected_sources:
        logger.warning("No valid source types provided. Using all available sources.")
        selected_sources = get_all_sources()

    logger.info(f"Selected sources: {[source.domain for source in selected_sources]}")

    # Initialize collections for URLs
    all_urls = []
    domain_results = {}

    try:
        # Search and collect URLs
        for page in range(1, MAX_PAGES + 1):
            if len(all_urls) >= 50:
                logger.info("Reached maximum URL limit of 50")
                break

            logger.info(f"Fetching page {page} of search results")
            search_response = await google_custom_search(request.search_text, selected_sources, page)

            if not search_response or not search_response.get("items"):
                logger.warning(f"No results found on page {page}")
                break

            for item in search_response.get("items", []):
                url = item.get("link")
                if not url:
                    continue

                domain = get_domain_from_url(url)
                logger.debug(f"Processing URL: {url} with domain: {domain}")

                if is_valid_source_domain(domain, selected_sources):
                    if domain not in domain_results:
                        domain_results[domain] = []

                    if len(domain_results[domain]) < MAX_URLS_PER_DOMAIN:
                        domain_results[domain].append({
                            "url": url,
                            "title": item.get("title", ""),
                            "snippet": item.get("snippet", "")
                        })
                        all_urls.append(url)
                else:
                    logger.debug(f"Skipping URL {url} - domain not in allowed list")

                if len(all_urls) >= 50:
                    break

        logger.info(f"Total URLs collected: {len(all_urls)}")

        if not all_urls:
            return EnhancedFactCheckResponse(
                query=request.search_text,
                results=[],
                sources=[],
                summary=Summary(
                    fact_checking_sites_queried=len(selected_sources),
                    total_sources=0
                ),
                token_usage={
                    "prompt_tokens": 0,
                    "completion_tokens": 0,
                    "total_tokens": 0
                },
                total_claims_found=0,
                verification_result=VerificationResult(
                    verdict="Insufficient Evidence",
                    confidence="Low",
                    evidence=["No relevant sources found"],
                    reasoning="No fact-checking sources were found for this claim",
                    fact_check_type="ai fact checker"
                )
            )

        # Perform fact check with collected URLs
        fact_check_request = AIFactCheckRequest(
            content=request.search_text,
            urls=all_urls[:5]  # Limit to 5 URLs
        )

        logger.info(f"Performing fact check with {len(fact_check_request.urls)} URLs")
        fact_check_response = await ai_fact_check(fact_check_request)

        # Get enhanced analysis
        openai_client = OpenAIClient(OPENAI_API_KEY)
        enhanced_response = await analyze_fact_check_results(
            openai_client,
            fact_check_response.dict()
        )

        return EnhancedFactCheckResponse(**enhanced_response)

    except HTTPException:
        # Let HTTP errors raised by the helpers above pass through unchanged.
        raise
    except Exception as e:
        logger.error(f"Error during search/fact-check process: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
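
# Illustrative request sketch for this endpoint. The mount prefix and the
# source_type values are assumptions; valid keys come from SOURCES in
# app.websites.fact_checker_website:
#
#   POST /search
#   {"search_text": "Example claim to verify", "source_types": ["fact_checkers"]}
#
# The response body follows the EnhancedFactCheckResponse model defined above.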