Dev #1
7 changed files with 390 additions and 311 deletions
Binary file not shown.
Binary file not shown.
@@ -2,7 +2,7 @@ from fastapi import APIRouter, HTTPException
 import json
 from datetime import datetime
 from typing import Dict, List
+import httpx
 from app.config import GOOGLE_API_KEY, GOOGLE_FACT_CHECK_BASE_URL
 from app.models.fact_check_models import (
     GoogleFactCheckRequest as FactCheckRequest,
@@ -12,7 +12,6 @@ from app.models.fact_check_models import (
     TokenUsage
 )
 from app.websites.fact_checker_website import fetch_fact_checks, get_all_sources
-from app.api.scrap_websites import SearchRequest, search_websites

 fact_check_router = APIRouter()

@@ -22,6 +21,39 @@ class CustomJSONEncoder(json.JSONEncoder):
             return obj.isoformat()
         return super().default(obj)

+async def validate_api_key():
+    """Validate the Google API key with a test request"""
+    async with httpx.AsyncClient() as client:
+        try:
+            test_url = f"{GOOGLE_FACT_CHECK_BASE_URL}claims:search"
+            params = {
+                "key": GOOGLE_API_KEY,
+                "query": "test",
+                "languageCode": "en-US",
+                "pageSize": 1
+            }
+            response = await client.get(test_url, params=params)
+            response.raise_for_status()
+            return True
+        except httpx.HTTPStatusError as e:
+            if e.response.status_code == 403:
+                raise HTTPException(
+                    status_code=503,
+                    detail=ErrorResponse(
+                        detail="Invalid or expired API key",
+                        error_code="INVALID_API_KEY",
+                        path="/check-facts"
+                    ).dict()
+                )
+            raise HTTPException(
+                status_code=503,
+                detail=ErrorResponse(
+                    detail=f"API validation failed: {str(e)}",
+                    error_code="API_VALIDATION_ERROR",
+                    path="/check-facts"
+                ).dict()
+            )
+
 @fact_check_router.post(
     "/check-facts",
     response_model=FactCheckResponse,
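For reference, the new validate_api_key helper above probes the Google Fact Check Tools claims:search endpoint before any real lookup is attempted. A minimal standalone sketch of that probe follows; it assumes GOOGLE_FACT_CHECK_BASE_URL resolves to https://factchecktools.googleapis.com/v1alpha1/ and that GOOGLE_API_KEY is set in the environment, and it reduces the error handling to a boolean result rather than the router's HTTPException flow.

# Minimal sketch: probe the Fact Check Tools claims:search endpoint with httpx.
# The base URL is an assumed value of GOOGLE_FACT_CHECK_BASE_URL; adjust if needed.
import asyncio
import os

import httpx

BASE_URL = "https://factchecktools.googleapis.com/v1alpha1/"  # assumed


async def probe_fact_check_api() -> bool:
    params = {
        "key": os.environ["GOOGLE_API_KEY"],
        "query": "test",
        "languageCode": "en-US",
        "pageSize": 1,
    }
    async with httpx.AsyncClient(timeout=10.0) as client:
        try:
            response = await client.get(f"{BASE_URL}claims:search", params=params)
            response.raise_for_status()
            return True
        except httpx.HTTPStatusError as exc:
            # 403 usually means the key is invalid, expired, or the API is not enabled.
            print(f"API check failed with status {exc.response.status_code}")
            return False


if __name__ == "__main__":
    print(asyncio.run(probe_fact_check_api()))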
@@ -34,7 +66,7 @@ class CustomJSONEncoder(json.JSONEncoder):
 )
 async def check_facts(request: FactCheckRequest) -> FactCheckResponse:
     """
-    Check facts using multiple fact-checking sources and fallback to web search
+    Check facts using multiple fact-checking sources
     """
     all_results = []
     verified_results = []
@@ -50,10 +82,14 @@ async def check_facts(request: FactCheckRequest) -> FactCheckResponse:
             ).dict()
         )

+    # Validate API key before proceeding
+    await validate_api_key()
+
     # Get all sources in priority order
     all_sources = get_all_sources()
     all_sources_list = []  # To store source URLs
     contexts_used = []  # To store context snippets
+    failed_sources = []  # Track failed sources

     for source in all_sources:
         try:
@@ -78,75 +114,39 @@ async def check_facts(request: FactCheckRequest) -> FactCheckResponse:
                     if "textualRating" in review:
                         contexts_used.append(review["textualRating"])

-        except HTTPException:
+        except HTTPException as http_err:
+            failed_sources.append({
+                "source": source.domain,
+                "error": str(http_err.detail)
+            })
             continue
         except Exception as e:
-            # Log the error but continue with other sources
-            print(f"Error processing {source.domain}: {str(e)}")
+            failed_sources.append({
+                "source": source.domain,
+                "error": str(e)
+            })
             continue

-    # If no results found, try searching websites
-    if not all_results:
-        try:
-            # Create search request
-            search_request = SearchRequest(
-                search_text=request.content,
-                source_types=["fact_checkers"]
-            )
-
-            # Perform website search
-            search_response = await search_websites(search_request)
-
-            # If AI fact check results are available, use them
-            if search_response.ai_fact_check_result:
-                # Create a claim from AI fact check result
-                ai_claim = {
-                    "text": request.content,
-                    "claimant": "AI Analysis",
-                    "claimDate": datetime.now().isoformat(),
-                    "claimReview": [{
-                        "publisher": {
-                            "name": "AI Fact Checker",
-                            "site": "ai-fact-check"
-                        },
-                        "textualRating": search_response.ai_fact_check_result.verification_result["verdict"],
-                        "title": "AI Fact Check Analysis",
-                        "reviewDate": datetime.now().isoformat(),
-                        "url": ""
-                    }]
-                }
-                validated_claim = Claim(**ai_claim).dict()
-                all_results.append(validated_claim)
-
-                # Add sources and contexts
-                all_sources_list.extend(search_response.results.keys())
-                if search_response.ai_fact_check_result.verification_result["evidence"]:
-                    contexts_used.extend(search_response.ai_fact_check_result.verification_result["evidence"])
-
-        except Exception as e:
-            print(f"Error during website search: {str(e)}")
-
-    # If still no results found after searching websites
-    if not all_results:
+    # Return partial results if some sources failed but we have data
+    if all_results:
+        verification_result = {
+            "verdict": "Partial Results Available" if failed_sources else "Complete Results",
+            "confidence": "Medium" if failed_sources else "High",
+            "evidence": contexts_used,
+            "reasoning": "Based on available fact checks",
+            "missing_info": f"{len(failed_sources)} sources failed" if failed_sources else None
+        }
+    else:
         raise HTTPException(
             status_code=404,
             detail=ErrorResponse(
-                detail="No fact check results found",
+                detail="No fact check results found. Failed sources: " +
+                    ", ".join([f"{f['source']}: {f['error']}" for f in failed_sources]),
                 error_code="NO_RESULTS_FOUND",
                 path="/check-facts"
             ).dict()
         )

-    # Prepare the verification result
-    verification_result = {
-        "verdict": "Insufficient Information",  # Default verdict
-        "confidence": "Low",
-        "evidence": contexts_used,
-        "reasoning": "Based on available fact checks and web search results",
-        "missing_info": "Additional verification may be needed"
-    }
-
     # Create token usage information
     token_usage = TokenUsage(
         prompt_tokens=0,
@@ -161,10 +161,12 @@ async def check_facts(request: FactCheckRequest) -> FactCheckResponse:
         results=all_results,
         verification_result=verification_result,
         sources=list(set(all_sources_list)),
+        context_used=contexts_used,
         token_usage=token_usage,
         summary={
             "total_sources": len(set(all_sources_list)),
-            "fact_checking_sites_queried": len(all_sources)
+            "fact_checking_sites_queried": len(all_sources),
+            "failed_sources": failed_sources
         }
     )
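With failed_sources in place, a single failing source no longer aborts the whole request: whatever evidence was gathered is returned with a downgraded verdict and confidence, and the failures are reported in the summary. The sketch below isolates that decision rule in a hypothetical helper so it can be exercised on its own; the verdict and confidence strings come from the diff above, the function itself is illustrative.

# Illustrative sketch of the graceful-degradation rule introduced above:
# build a verdict dict from whatever evidence was collected, downgrading
# confidence when some sources failed. The helper name and types are hypothetical.
from typing import Dict, List


def build_verification_result(evidence: List[str], failed_sources: List[Dict[str, str]]) -> dict:
    return {
        "verdict": "Partial Results Available" if failed_sources else "Complete Results",
        "confidence": "Medium" if failed_sources else "High",
        "evidence": evidence,
        "reasoning": "Based on available fact checks",
        "missing_info": f"{len(failed_sources)} sources failed" if failed_sources else None,
    }


if __name__ == "__main__":
    complete = build_verification_result(["False", "Mostly false"], [])
    degraded = build_verification_result(["False"], [{"source": "example.org", "error": "timeout"}])
    print(complete["verdict"], "/", degraded["verdict"])  # Complete Results / Partial Results Available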
app/api/scrap_websites.py

@@ -1,309 +1,342 @@
 from fastapi import APIRouter, HTTPException
-from pydantic import BaseModel
-from typing import List, Dict, Optional
-from urllib.parse import urlencode, urlparse
-import urllib.parse
-import numpy as np
-from time import sleep
+import httpx
 import logging
-import requests
-from bs4 import BeautifulSoup
-import re
+from urllib.parse import urlparse
+import json
 from app.services.openai_client import OpenAIClient
-from app.config import OPENAI_API_KEY
+from app.config import OPENAI_API_KEY, GOOGLE_API_KEY, GOOGLE_ENGINE_ID
 from app.websites.fact_checker_website import SOURCES, get_all_sources
 from app.api.ai_fact_check import ai_fact_check
+from typing import List, Dict, Optional
+from pydantic import BaseModel
 from app.models.fact_check_models import (
     AIFactCheckRequest,
-    AIFactCheckResponse,
-    VerificationResult,
-    TokenUsage
+    FactCheckSource,
+    SourceType
 )

+# Define Pydantic models
+class Publisher(BaseModel):
+    name: str
+    site: str
+
+class ClaimReview(BaseModel):
+    publisher: Publisher
+    textualRating: str
+
+class Claim(BaseModel):
+    claimReview: List[ClaimReview]
+    claimant: str
+    text: str
+
+class Summary(BaseModel):
+    fact_checking_sites_queried: int
+    total_sources: int
+
+class VerificationResult(BaseModel):
+    verdict: str
+    confidence: str
+    evidence: List[str]
+    reasoning: str
+    fact_check_type: str
+
+class SearchRequest(BaseModel):
+    search_text: str
+    source_types: List[str]
+
+class EnhancedFactCheckResponse(BaseModel):
+    query: str
+    results: List[Dict]
+    sources: List
+    summary: Summary
+    token_usage: Dict[str, int]
+    total_claims_found: int
+    verification_result: VerificationResult
+
 # Configure logging
 logging.basicConfig(
-    level=logging.INFO,
+    level=logging.INFO,  # Changed back to INFO from DEBUG
     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 )
 logger = logging.getLogger(__name__)

 scrap_websites_router = APIRouter()

-# Configuration for scraping
-MAX_RETRIES = 2
-RETRY_DELAY = 2
-
-class SearchRequest(BaseModel):
-    search_text: str
-    source_types: List[str] = ["fact_checkers"]
-
-class UrlSimilarityInfo(BaseModel):
-    url: str
-    similarity: float
-    extracted_text: str
-
-class SearchResponse(BaseModel):
-    results: Dict[str, List[str]]
-    error_messages: Dict[str, str]
-    ai_fact_check_result: Optional[Dict] = None
-
-def extract_url_text(url: str) -> str:
-    """Extract and process meaningful text from URL path with improved cleaning"""
-    logger.debug(f"Extracting text from URL: {url}")
-    try:
-        parsed = urllib.parse.urlparse(url)
-        path = parsed.path
-        path = path.replace('.html', '').replace('/index', '').replace('.php', '')
-        segments = [seg for seg in path.split('/') if seg]
-        cleaned_segments = []
-        for segment in segments:
-            segment = segment.replace('-', ' ').replace('_', ' ')
-            if not (segment.replace(' ', '').isdigit() or
-                    all(part.isdigit() for part in segment.split() if part)):
-                cleaned_segments.append(segment)
-
-        common_words = {
-            'www', 'live', 'news', 'intl', 'index', 'world', 'us', 'uk',
-            'updates', 'update', 'latest', 'breaking', 'new', 'article'
-        }
-
-        text = ' '.join(cleaned_segments)
-        words = [word.lower() for word in text.split()
-                 if word.lower() not in common_words and len(word) > 1]
-
-        result = ' '.join(words)
-        logger.debug(f"Extracted text: {result}")
-        return result
-    except Exception as e:
-        logger.error(f"Error extracting text from URL {url}: {str(e)}")
-        return ''
-
-def extract_search_results(html_content):
-    """Extract URLs using multiple selectors and patterns"""
-    soup = BeautifulSoup(html_content, 'html.parser')
-    urls = set()  # Using set to avoid duplicates
-
-    # Multiple CSS selectors to try
-    selectors = [
-        'div.g div.yuRUbf > a',  # Main result links
-        'div.g a.l',  # Alternative link format
-        'div.rc a',  # Another possible format
-        'div[class*="g"] > div > div > div > a',  # Broader match
-        'a[href^="http"]'  # Any http link
-    ]
-
-    for selector in selectors:
-        try:
-            elements = soup.select(selector)
-            for element in elements:
-                url = element.get('href')
-                if url and url.startswith('http') and not url.startswith('https://www.google.com'):
-                    urls.add(url)
-        except Exception as e:
-            logger.debug(f"Error with selector {selector}: {str(e)}")
-
-    # Also try finding URLs in the raw HTML using regex
-    url_pattern = r'href="(https?://[^"]+)"'
-    raw_urls = re.findall(url_pattern, html_content)
-    for url in raw_urls:
-        if not url.startswith('https://www.google.com'):
-            urls.add(url)
-
-    return list(urls)
-
-def google_search_scraper(search_text: str, site_domain: str, retry_count: int = 0) -> List[str]:
-    """Scrape Google search results with multiple query formats"""
-    logger.info(f"Searching for '{search_text}' on domain: {site_domain}")
-
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.5',
-        'Referer': 'https://www.google.com/',
-        'DNT': '1'
-    }
-
-    # Try different query formats
-    query_formats = [
-        f"{search_text} site:{site_domain}",
-        f"site:{site_domain} {search_text}",
-        f"\"{search_text}\" site:{site_domain}"
-    ]
-
-    all_urls = set()
-
-    for query in query_formats:
-        try:
-            google_url = f"https://www.google.com/search?q={urlencode({'q': query})}"
-            logger.debug(f"Trying query format: {query}")
-
-            response = requests.get(google_url, headers=headers)
-
-            if response.status_code == 200:
-                urls = extract_search_results(response.text)
-                domain_urls = [url for url in urls if site_domain in urlparse(url).netloc]
-                all_urls.update(domain_urls)
-            else:
-                logger.warning(f"Received status code {response.status_code} for query format: {query}")
-
-            sleep(2)  # Delay between requests
-
-        except Exception as e:
-            logger.error(f"Error processing query format '{query}': {str(e)}")
-            if retry_count < MAX_RETRIES:
-                sleep(RETRY_DELAY)
-                return google_search_scraper(search_text, site_domain, retry_count + 1)
-
-    valid_urls = list(all_urls)
-    logger.info(f"Found {len(valid_urls)} unique URLs for domain: {site_domain}")
-    return valid_urls[:5]  # Return up to 5 URLs
-
-def calculate_similarity(query_embedding: List[float], url_embedding: List[float]) -> float:
-    """Calculate cosine similarity between two embeddings"""
-    query_array = np.array(query_embedding)
-    url_array = np.array(url_embedding)
-
-    similarity = np.dot(url_array, query_array) / (
-        np.linalg.norm(url_array) * np.linalg.norm(query_array)
-    )
-    return float(similarity)
-
-@scrap_websites_router.post("/search", response_model=SearchResponse)
+# Constants
+RESULTS_PER_PAGE = 10
+MAX_PAGES = 5
+MAX_URLS_PER_DOMAIN = 5
+GOOGLE_SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
+
+def get_domain_from_url(url: str) -> str:
+    """Extract domain from URL with improved handling."""
+    try:
+        parsed = urlparse(url)
+        domain = parsed.netloc.lower()
+        # Remove 'www.' if present
+        if domain.startswith('www.'):
+            domain = domain[4:]
+        return domain
+    except Exception as e:
+        logger.error(f"Error extracting domain from URL {url}: {str(e)}")
+        return ""
+
+def is_valid_source_domain(domain: str, sources: List[FactCheckSource]) -> bool:
+    """Check if domain matches any source with improved matching logic."""
+    if not domain:
+        return False
+
+    domain = domain.lower()
+    if domain.startswith('www.'):
+        domain = domain[4:]
+
+    for source in sources:
+        source_domain = source.domain.lower()
+        if source_domain.startswith('www.'):
+            source_domain = source_domain[4:]
+
+        # Check exact match
+        if domain == source_domain:
+            logger.debug(f"Exact domain match found: {domain} = {source_domain}")
+            return True
+
+        # Check if domain ends with source domain
+        if domain.endswith('.' + source_domain):
+            logger.debug(f"Subdomain match found: {domain} ends with {source_domain}")
+            return True
+
+    logger.debug(f"No match found for domain: {domain}")
+    return False
+
+async def build_enhanced_search_query(query: str, sources: List[FactCheckSource]) -> str:
+    """Build search query with site restrictions."""
+    site_queries = [f"site:{source.domain}" for source in sources]
+    site_restriction = " OR ".join(site_queries)
+    enhanced_query = f"({query}) ({site_restriction})"
+    logger.debug(f"Enhanced search query: {enhanced_query}")
+    return enhanced_query
+
+async def google_custom_search(query: str, sources: List[FactCheckSource], page: int = 1) -> Optional[Dict]:
+    """Perform Google Custom Search with enhanced query."""
+    enhanced_query = await build_enhanced_search_query(query, sources)
+    start_index = ((page - 1) * RESULTS_PER_PAGE) + 1
+
+    params = {
+        "key": GOOGLE_API_KEY,
+        "cx": GOOGLE_ENGINE_ID,
+        "q": enhanced_query,
+        "num": RESULTS_PER_PAGE,
+        "start": start_index
+    }
+
+    async with httpx.AsyncClient(timeout=30.0) as client:
+        try:
+            logger.info(f"Making API request to Google Custom Search with params: {params}")
+            response = await client.get(GOOGLE_SEARCH_URL, params=params)
+            response.raise_for_status()
+
+            data = response.json()
+            search_info = data.get('searchInformation', {})
+            logger.info(f"Search info: Total results: {search_info.get('totalResults', 0)}, "
+                        f"Time taken: {search_info.get('searchTime', 0)}s")
+
+            if 'error' in data:
+                error_details = data['error']
+                logger.error(f"API Error: {error_details}")
+                raise HTTPException(
+                    status_code=response.status_code,
+                    detail=f"Google API Error: {error_details.get('message')}"
+                )
+
+            return data
+        except Exception as e:
+            logger.error(f"Search error: {str(e)}", exc_info=True)
+            raise HTTPException(status_code=500, detail=f"Search error: {str(e)}")
+
+async def analyze_fact_check_results(openai_client: OpenAIClient, original_response: Dict) -> Dict:
+    """Analyze fact check results using OpenAI to generate a consolidated verdict."""
+
+    # Extract verification results from sources
+    verification_results = []
+    for url, result in original_response.get('verification_result', {}).items():
+        verification_results.append(f"""
+        Source: {url}
+        Verdict: {result.get('verdict')}
+        Confidence: {result.get('confidence')}
+        Evidence: {result.get('evidence')}
+        Reasoning: {result.get('reasoning')}
+        """)
+
+    system_prompt = """You are a professional fact-checking analyzer. Your task is to analyze multiple fact-checking results
+    and provide a consolidated verdict. Respond with a valid JSON object containing your analysis."""
+
+    user_prompt = f"""
+    Analyze these fact-checking results and provide a final verdict.
+
+    Query: {original_response.get('query', '')}
+
+    Fact Check Results:
+    {'\n'.join(verification_results)}"""
+
+    try:
+        logger.info("Generating AI analysis of fact check results")
+        response = await openai_client.generate_text_response(
+            system_prompt=system_prompt,
+            user_prompt=user_prompt,
+            max_tokens=2000
+        )
+
+        # Create the enhanced result structure
+        enhanced_result = {
+            "query": original_response.get('query', ''),
+            "results": [
+                {
+                    "claimReview": [
+                        {
+                            "publisher": {
+                                "name": source,
+                                "site": source
+                            },
+                            "textualRating": result.get('verdict', '')
+                        } for source in original_response.get('sources', [])
+                    ],
+                    "claimant": "source",
+                    "text": original_response.get('query', '')
+                }
+            ],
+            "sources": original_response.get('sources', []),
+            "summary": {
+                "fact_checking_sites_queried": len(original_response.get('sources', [])),
+                "total_sources": len(original_response.get('verification_result', {}))
+            },
+            "verification_result": {
+                "verdict": next(iter(original_response.get('verification_result', {}).values()), {}).get('verdict', ''),
+                "confidence": next(iter(original_response.get('verification_result', {}).values()), {}).get('confidence', ''),
+                "evidence": [next(iter(original_response.get('verification_result', {}).values()), {}).get('evidence', '')],
+                "reasoning": next(iter(original_response.get('verification_result', {}).values()), {}).get('reasoning', ''),
+                "fact_check_type": "ai fact checker"
+            },
+            "token_usage": original_response.get('token_usage', {
+                "prompt_tokens": 0,
+                "completion_tokens": 0,
+                "total_tokens": 0
+            })
+        }
+
+        enhanced_result["total_claims_found"] = len(enhanced_result.get("results", []))
+
+        logger.info("Successfully generated AI analysis")
+        return enhanced_result
+
+    except Exception as e:
+        logger.error(f"Error in OpenAI analysis: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Error in fact check analysis: {str(e)}")
+
+@scrap_websites_router.post("/search", response_model=EnhancedFactCheckResponse)
 async def search_websites(request: SearchRequest):
     logger.info(f"Starting search with query: {request.search_text}")
     logger.info(f"Source types requested: {request.source_types}")

-    results = {}
-    error_messages = {}
-
-    # Initialize OpenAI client
-    logger.debug("Initializing OpenAI client")
-    openai_client = OpenAIClient(OPENAI_API_KEY)
-
-    # Get domains based on requested source types
-    domains = []
+    # Get sources for requested types
+    selected_sources = []
     for source_type in request.source_types:
         if source_type in SOURCES:
-            domains.extend([source.domain for source in SOURCES[source_type]])
+            selected_sources.extend(SOURCES[source_type])

-    if not domains:
-        logger.warning("No valid source types provided. Using all available domains.")
-        domains = [source.domain for source in get_all_sources()]
+    if not selected_sources:
+        logger.warning("No valid source types provided. Using all available sources.")
+        selected_sources = get_all_sources()

-    logger.info(f"Processing {len(domains)} domains")
+    logger.info(f"Selected sources: {[source.domain for source in selected_sources]}")

-    # Enhance search text with key terms
-    search_context = request.search_text
-    logger.debug("Getting query embedding from OpenAI")
-    query_embedding = openai_client.get_embeddings([search_context])[0]
-
-    # Higher similarity threshold for better filtering
-    SIMILARITY_THRESHOLD = 0.75
-    MAX_URLS_PER_DOMAIN = 2  # Adjusted to ensure total stays under 5
-    TOTAL_MAX_URLS = 5  # Maximum URLs allowed for AIFactCheckRequest
-
-    total_urls_collected = 0
-    for domain in domains[:3]:  # Limit to 3 domains for testing
-        if total_urls_collected >= TOTAL_MAX_URLS:
-            break
-
-        logger.info(f"Processing domain: {domain}")
-        try:
-            urls = google_search_scraper(request.search_text, domain)
-            valid_urls = []
-
-            logger.debug(f"Found {len(urls)} URLs for domain {domain}")
-
-            for url in urls:
-                if len(valid_urls) >= MAX_URLS_PER_DOMAIN or total_urls_collected >= TOTAL_MAX_URLS:
-                    break
-
-                url_text = extract_url_text(url)
-
-                if not url_text:
-                    logger.debug(f"No meaningful text extracted from URL: {url}")
-                    continue
-
-                logger.debug("Getting URL embedding from OpenAI")
-                url_embedding = openai_client.get_embeddings([url_text])[0]
-                similarity = calculate_similarity(query_embedding, url_embedding)
-
-                logger.debug(f"Similarity score for {url}: {similarity}")
-
-                if similarity >= SIMILARITY_THRESHOLD:
-                    valid_urls.append(url)
-                    total_urls_collected += 1
-
-            results[domain] = valid_urls
-            logger.info(f"Successfully processed domain {domain}. Found {len(valid_urls)} valid URLs")
-
-        except HTTPException as e:
-            logger.error(f"HTTP Exception for domain {domain}: {str(e.detail)}")
-            error_messages[domain] = str(e.detail)
-        except Exception as e:
-            logger.error(f"Unexpected error for domain {domain}: {str(e)}")
-            error_messages[domain] = f"Unexpected error for {domain}: {str(e)}"
-
-        sleep(1)  # Add delay between processing different domains
-
-    logger.info("Search completed")
-    logger.debug(f"Results found for {len(results)} domains")
-    logger.debug(f"Errors encountered for {len(error_messages)} domains")
-
-    # Collect all valid URLs from results
-    all_valid_urls = []
-    for domain_urls in results.values():
-        all_valid_urls.extend(domain_urls)
-
-    logger.info(f"Total valid URLs collected: {len(all_valid_urls)}")
-
-    # Create request body for AI fact check
-    if all_valid_urls:
-        fact_check_request = AIFactCheckRequest(
-            content=request.search_text,
-            urls=all_valid_urls[:TOTAL_MAX_URLS]  # Ensure we don't exceed the limit
-        )
-
-        logger.info("Calling AI fact check service")
-        try:
-            ai_response = await ai_fact_check(fact_check_request)
-            logger.info("AI fact check completed successfully")
-
-            # Format AI fact check response
-            formatted_response = {
-                "query": ai_response.query,
-                "token_usage": {
-                    "prompt_tokens": ai_response.token_usage.prompt_tokens,
-                    "completion_tokens": ai_response.token_usage.completion_tokens,
-                    "total_tokens": ai_response.token_usage.total_tokens
-                },
-                "sources": ai_response.sources,
-                "verification_result": {
-                    url: {
-                        "verdict": result.verdict,
-                        "confidence": result.confidence,
-                        "evidence": result.evidence,
-                        "reasoning": result.reasoning,
-                        "missing_info": result.missing_info
-                    } for url, result in ai_response.verification_result.items()
-                }
-            }
-
-            # Return response with AI fact check results
-            return SearchResponse(
-                results=results,
-                error_messages=error_messages,
-                ai_fact_check_result=formatted_response
-            )
-
-        except Exception as e:
-            logger.error(f"Error during AI fact check: {str(e)}")
-            error_messages["ai_fact_check"] = f"Error during fact checking: {str(e)}"
-
-    # Return response without AI fact check if no valid URLs or error occurred
-    return SearchResponse(
-        results=results,
-        error_messages=error_messages,
-        ai_fact_check_result=None
-    )
+    # Initialize collections for URLs
+    all_urls = []
+    domain_results = {}
+
+    try:
+        # Search and collect URLs
+        for page in range(1, MAX_PAGES + 1):
+            if len(all_urls) >= 50:
+                logger.info("Reached maximum URL limit of 50")
+                break
+
+            logger.info(f"Fetching page {page} of search results")
+            search_response = await google_custom_search(request.search_text, selected_sources, page)
+
+            if not search_response or not search_response.get("items"):
+                logger.warning(f"No results found on page {page}")
+                break
+
+            for item in search_response.get("items", []):
+                url = item.get("link")
+                if not url:
+                    continue
+
+                domain = get_domain_from_url(url)
+                logger.debug(f"Processing URL: {url} with domain: {domain}")
+
+                if is_valid_source_domain(domain, selected_sources):
+                    if domain not in domain_results:
+                        domain_results[domain] = []
+
+                    if len(domain_results[domain]) < MAX_URLS_PER_DOMAIN:
+                        domain_results[domain].append({
+                            "url": url,
+                            "title": item.get("title", ""),
+                            "snippet": item.get("snippet", "")
+                        })
+                        all_urls.append(url)
+                else:
+                    logger.debug(f"Skipping URL {url} - domain not in allowed list")
+
+            if len(all_urls) >= 50:
+                break
+
+        logger.info(f"Total URLs collected: {len(all_urls)}")
+
+        if not all_urls:
+            return EnhancedFactCheckResponse(
+                query=request.search_text,
+                results=[],
+                sources=[],
+                summary=Summary(
+                    fact_checking_sites_queried=len(selected_sources),
+                    total_sources=0
+                ),
+                token_usage={
+                    "prompt_tokens": 0,
+                    "completion_tokens": 0,
+                    "total_tokens": 0
+                },
+                total_claims_found=0,
+                verification_result=VerificationResult(
+                    verdict="Insufficient Evidence",
+                    confidence="Low",
+                    evidence=["No relevant sources found"],
+                    reasoning="No fact-checking sources were found for this claim",
+                    fact_check_type="ai fact checker"
+                )
+            )
+
+        # Perform fact check with collected URLs
+        fact_check_request = AIFactCheckRequest(
+            content=request.search_text,
+            urls=all_urls[:5]  # Limit to 5 URLs
+        )
+
+        logger.info(f"Performing fact check with {len(fact_check_request.urls)} URLs")
+        fact_check_response = await ai_fact_check(fact_check_request)
+
+        # Get enhanced analysis
+        openai_client = OpenAIClient(OPENAI_API_KEY)
+        enhanced_response = await analyze_fact_check_results(
+            openai_client,
+            fact_check_response.dict()
+        )
+
+        return EnhancedFactCheckResponse(**enhanced_response)
+
+    except Exception as e:
+        logger.error(f"Error during search/fact-check process: {str(e)}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
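The rewritten module queries the Custom Search JSON API instead of scraping google.com result pages. The sketch below shows that request shape in isolation: a site:-restricted query sent to https://www.googleapis.com/customsearch/v1 with the key, cx, q, num, and start parameters used above. The endpoint and parameters come from the diff; the helper name is illustrative, and GOOGLE_API_KEY and GOOGLE_ENGINE_ID are assumed to be set in the environment.

# Minimal sketch: query the Custom Search JSON API with a site:-restricted query,
# mirroring build_enhanced_search_query + google_custom_search above.
import asyncio
import os

import httpx

GOOGLE_SEARCH_URL = "https://www.googleapis.com/customsearch/v1"


async def site_restricted_search(query: str, domains: list, page: int = 1) -> list:
    site_restriction = " OR ".join(f"site:{d}" for d in domains)
    params = {
        "key": os.environ["GOOGLE_API_KEY"],
        "cx": os.environ["GOOGLE_ENGINE_ID"],
        "q": f"({query}) ({site_restriction})",
        "num": 10,                     # results per page
        "start": (page - 1) * 10 + 1,  # 1-based start index
    }
    async with httpx.AsyncClient(timeout=30.0) as client:
        response = await client.get(GOOGLE_SEARCH_URL, params=params)
        response.raise_for_status()
        data = response.json()
    # Return just the result links, as the router does before domain filtering.
    return [item["link"] for item in data.get("items", [])]


if __name__ == "__main__":
    urls = asyncio.run(site_restricted_search("example claim", ["snopes.com", "politifact.com"]))
    print(urls)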
app/config.py

@@ -5,6 +5,7 @@ load_dotenv()

 GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]
 GOOGLE_FACT_CHECK_BASE_URL= os.environ["GOOGLE_FACT_CHECK_BASE_URL"]
+GOOGLE_ENGINE_ID = os.environ["GOOGLE_ENGINE_ID"]

 OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
 FRONTEND_URL = os.environ["FRONTEND_URL"]
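GOOGLE_ENGINE_ID is the new required environment variable; it is passed as the cx parameter of the Custom Search request, so it should hold a Programmable Search Engine ID. Below is a hedged sketch of loading and sanity-checking these settings with python-dotenv; the variable names match the config, while the check_required helper is illustrative and not part of the codebase.

# Illustrative sketch of loading the new setting alongside the existing ones.
import os

from dotenv import load_dotenv

load_dotenv()

REQUIRED_VARS = [
    "GOOGLE_API_KEY",
    "GOOGLE_FACT_CHECK_BASE_URL",
    "GOOGLE_ENGINE_ID",  # new: Programmable Search Engine ID, sent as "cx"
    "OPENAI_API_KEY",
    "FRONTEND_URL",
]


def check_required() -> dict:
    """Fail fast with a clear message if any required variable is missing."""
    missing = [name for name in REQUIRED_VARS if not os.environ.get(name)]
    if missing:
        raise RuntimeError(f"Missing environment variables: {', '.join(missing)}")
    return {name: os.environ[name] for name in REQUIRED_VARS}


if __name__ == "__main__":
    print(sorted(check_required()))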
Binary file not shown.

app/models/scrap_websites_models.py (new file, 43 lines)

@@ -0,0 +1,43 @@
+from pydantic import BaseModel
+from typing import List, Dict
+
+class SearchRequest(BaseModel):
+    search_text: str
+    source_types: List[str] = ["fact_checkers"]
+
+class Publisher(BaseModel):
+    name: str
+    site: str
+
+class ClaimReview(BaseModel):
+    publisher: Publisher
+    textualRating: str
+
+class Claim(BaseModel):
+    claimReview: List[ClaimReview]
+    claimant: str
+    text: str
+
+class Summary(BaseModel):
+    fact_checking_sites_queried: int
+    total_sources: int
+
+class TokenUsage(BaseModel):
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+
+class VerificationResult(BaseModel):
+    verdict: str
+    confidence: str
+    evidence: List[str]
+    reasoning: str
+
+class EnhancedFactCheckResponse(BaseModel):
+    query: str
+    results: List[Claim]
+    sources: List[str]
+    summary: Summary
+    token_usage: Dict[str, int]
+    total_claims_found: int
+    verification_result: VerificationResult
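As a usage sketch, the snippet below builds an EnhancedFactCheckResponse from these models by hand and serializes it the way the routers do; all field values are invented for illustration.

# Illustrative sketch: constructing the new response model with made-up values.
from app.models.scrap_websites_models import (
    Claim,
    ClaimReview,
    EnhancedFactCheckResponse,
    Publisher,
    Summary,
    VerificationResult,
)

claim = Claim(
    text="Example claim text",
    claimant="source",
    claimReview=[
        ClaimReview(
            publisher=Publisher(name="snopes.com", site="snopes.com"),
            textualRating="False",
        )
    ],
)

response = EnhancedFactCheckResponse(
    query="Example claim text",
    results=[claim],
    sources=["snopes.com"],
    summary=Summary(fact_checking_sites_queried=1, total_sources=1),
    token_usage={"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
    total_claims_found=1,
    verification_result=VerificationResult(
        verdict="False",
        confidence="High",
        evidence=["Example evidence snippet"],
        reasoning="Based on available fact checks",
    ),
)

print(response.dict()["verification_result"]["verdict"])  # False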