base code added

This commit is contained in:
Utsho Dey 2024-12-12 17:31:44 +06:00
parent 83a886960b
commit 1a1a713e0f
10 changed files with 656 additions and 48 deletions

2
.gitignore vendored
View file

@ -1,4 +1,4 @@
env
.env
test.py
/__pycache__/
__pycache__

112
app/api/ai_fact_check.py Normal file
View file

@ -0,0 +1,112 @@
from fastapi import APIRouter, HTTPException
from app.services.openai_client import OpenAIClient, AIFactChecker
from app.config import OPENAI_API_KEY
from app.models.fact_check_models import (
AIFactCheckRequest,
AIFactCheckResponse,
VerificationResult,
TokenUsage,
ErrorResponse
)
from urllib.parse import urlparse
import asyncio
# Module-level objects, built once at import time and reused by every request
# served through this router.
aifact_check_router = APIRouter()
# The client is configured with the key read from app.config.
openai_client = OpenAIClient(api_key=OPENAI_API_KEY)
# The fact checker performs its OpenAI calls through the client above.
fact_checker = AIFactChecker(openai_client=openai_client)
@aifact_check_router.post(
    "/aicheck-facts",
    response_model=AIFactCheckResponse,
    responses={
        400: {"model": ErrorResponse},
        500: {"model": ErrorResponse}
    }
)
async def ai_fact_check(request: AIFactCheckRequest):
    """
    Endpoint to fact-check a given statement based on multiple webpage URLs.

    Input:
    - urls: List of webpage URLs to analyze (with or without http/https)
    - content: The fact statement to verify

    Response:
    - JSON response with verification results per URL, sources, and token usage

    A URL whose check raised, or whose result payload is malformed, receives
    an "Error" verdict of its own; the remaining URLs are still reported.

    Raises:
    - HTTPException 400 (INVALID_URL) when a ValueError escapes the handler body
    - HTTPException 500 (PROCESSING_ERROR) on any other unexpected failure
    """

    def error_result(message: str) -> VerificationResult:
        # One placeholder shape for every per-URL failure.
        return VerificationResult(
            verdict="Error",
            confidence="Low",
            evidence=f"Error checking URL: {message}",
            reasoning="URL processing failed",
            missing_info="Could not access or process the URL"
        )

    try:
        results = {}
        all_sources = set()
        all_contexts = []
        total_prompt_tokens = 0
        total_completion_tokens = 0
        total_tokens = 0

        # Start one check per URL; exceptions come back as values so a single
        # bad URL cannot fail the whole batch.
        tasks = [
            fact_checker.check_fact(url=url, query=request.content)
            for url in request.urls
        ]
        fact_check_results = await asyncio.gather(*tasks, return_exceptions=True)

        for url, result in zip(request.urls, fact_check_results):
            # BaseException (not Exception) so a returned CancelledError is
            # also treated as a failed URL instead of being indexed as a dict.
            if isinstance(result, BaseException):
                results[url] = error_result(str(result))
                continue

            # Read everything before touching the accumulators, so a
            # malformed payload leaves the totals untouched.
            try:
                verdict_data = result["verification_result"]
                verification_result = VerificationResult(
                    verdict=verdict_data["verdict"],
                    confidence=verdict_data["confidence"],
                    evidence=verdict_data["evidence"],
                    reasoning=verdict_data["reasoning"],
                    missing_info=verdict_data.get("missing_info", None)
                )
                sources = list(result["sources"])
                contexts = list(result["context_used"])
                usage = result["token_usage"]
                prompt_tokens = usage["prompt_tokens"]
                completion_tokens = usage["completion_tokens"]
                url_total_tokens = usage["total_tokens"]
            except (KeyError, TypeError, AttributeError, ValueError) as exc:
                # ValueError covers Pydantic's ValidationError; without this
                # it would surface below as a misleading 400 INVALID_URL.
                results[url] = error_result(f"malformed fact-check result ({exc!r})")
                continue

            results[url] = verification_result
            all_sources.update(sources)
            all_contexts.extend(contexts)

            # Accumulate token usage
            total_prompt_tokens += prompt_tokens
            total_completion_tokens += completion_tokens
            total_tokens += url_total_tokens

        token_usage = TokenUsage(
            prompt_tokens=total_prompt_tokens,
            completion_tokens=total_completion_tokens,
            total_tokens=total_tokens
        )

        return AIFactCheckResponse(
            query=request.content,
            verification_result=results,
            sources=list(all_sources),
            context_used=all_contexts,
            token_usage=token_usage
        )
    except ValueError as e:
        raise HTTPException(
            status_code=400,
            detail=ErrorResponse(
                detail=str(e),
                error_code="INVALID_URL",
                path="/aicheck-facts"
            ).model_dump()
        )
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=ErrorResponse(
                detail=f"Error processing fact-check request: {str(e)}",
                error_code="PROCESSING_ERROR",
                path="/aicheck-facts"
            ).model_dump()
        )

View file

@ -1,11 +1,15 @@
from fastapi import APIRouter, HTTPException
import json
from datetime import datetime
from typing import Dict
from typing import Dict, List
from app.config import GOOGLE_API_KEY, GOOGLE_FACT_CHECK_BASE_URL
from app.models.fact_check_models import (
FactCheckResponse, FactCheckRequest, Claim, ErrorResponse
GoogleFactCheckRequest as FactCheckRequest,
GoogleFactCheckResponse as FactCheckResponse,
Claim,
ErrorResponse,
TokenUsage
)
from app.websites.fact_checker_website import fetch_fact_checks, get_all_sources
@ -32,6 +36,7 @@ async def check_facts(request: FactCheckRequest) -> FactCheckResponse:
Check facts using multiple fact-checking sources
"""
all_results = []
verified_results = []
# Validate configuration
if not GOOGLE_API_KEY or not GOOGLE_FACT_CHECK_BASE_URL:
@ -46,6 +51,8 @@ async def check_facts(request: FactCheckRequest) -> FactCheckResponse:
# Get all sources in priority order
all_sources = get_all_sources()
all_sources_list = [] # To store source URLs
contexts_used = [] # To store context snippets
for source in all_sources:
try:
@ -58,11 +65,17 @@ async def check_facts(request: FactCheckRequest) -> FactCheckResponse:
if "claims" in result:
# Validate each claim through Pydantic
validated_claims = [
Claim(**claim).dict()
for claim in result["claims"]
]
all_results.extend(validated_claims)
for claim in result["claims"]:
validated_claim = Claim(**claim).dict()
all_results.append(validated_claim)
# Extract source and context information
if "claimReview" in validated_claim:
review = validated_claim["claimReview"][0]
if "publisher" in review and "site" in review["publisher"]:
all_sources_list.append(review["publisher"]["site"])
if "textualRating" in review:
contexts_used.append(review["textualRating"])
except HTTPException:
raise
@ -81,14 +94,33 @@ async def check_facts(request: FactCheckRequest) -> FactCheckResponse:
).dict()
)
# Create the response using Pydantic model
# Prepare the verification result
verification_result = {
"verdict": "Insufficient Information", # Default verdict
"confidence": "Low",
"evidence": contexts_used,
"reasoning": "Based on available fact checks",
"missing_info": "Additional verification may be needed"
}
# Create token usage information
token_usage = TokenUsage(
prompt_tokens=0,
completion_tokens=0,
total_tokens=0
)
# Create the response using Pydantic model with all required fields
response = FactCheckResponse(
query=request.content,
total_claims_found=len(all_results),
results=all_results,
verification_result=verification_result,
sources=list(set(all_sources_list)),
context_used=contexts_used,
token_usage=token_usage,
summary={
"total_sources": len(set(claim.get("claimReview", [{}])[0].get("publisher", {}).get("site", "")
for claim in all_results if claim.get("claimReview"))),
"total_sources": len(set(all_sources_list)),
"fact_checking_sites_queried": len(all_sources)
}
)

160
app/api/scrap_websites.py Normal file
View file

@ -0,0 +1,160 @@
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from typing import List, Dict
import requests
from bs4 import BeautifulSoup
import urllib.parse
import numpy as np
from app.services.openai_client import OpenAIClient
from app.config import OPENAI_API_KEY
# Router that carries the POST /search endpoint defined in this module.
scrap_websites_router = APIRouter()
class SearchRequest(BaseModel):
    # Request body of POST /search.

    # Free-text query: embedded for similarity scoring and sent to the scraper.
    search_text: str
    # Domains to search; the endpoint handles them one at a time.
    site_domains: List[str]
class UrlSimilarityInfo(BaseModel):
    # Similarity score of one search hit against the query.

    # The URL found by the search scraper.
    url: str
    # Cosine similarity between the query embedding and the URL-text embedding.
    similarity: float
    # Words extracted from the URL path that were embedded for this score.
    extracted_text: str
class SearchResponse(BaseModel):
    # Response body of POST /search; every mapping is keyed by site domain.

    # URLs whose similarity reached the endpoint's threshold.
    results: Dict[str, List[str]]
    # Error text for each domain whose search or scoring failed.
    error_messages: Dict[str, str]
    # Every scored URL for the domain, including those below the threshold.
    url_similarities: Dict[str, List[UrlSimilarityInfo]]
def extract_url_text(url: str) -> str:
    """
    Extract the meaningful words from a URL's path.

    Each path segment is percent-decoded, a trailing ``.html`` / ``.php``
    extension is removed, and hyphens/underscores become spaces. Segments
    made only of numbers (dates, ids) are skipped. The remaining words are
    lower-cased, and boilerplate words and single characters are dropped.

    Args:
        url: The URL to analyse.

    Returns:
        The kept words joined by single spaces, or ``''`` when nothing
        meaningful remains or the URL cannot be parsed.

    Example:
        >>> extract_url_text("https://example.com/news/2024/12/12/flag-drawn-on-campus.html")
        'flag drawn on campus'
    """
    # Very common words that don't add meaning
    common_words = {
        'www', 'live', 'news', 'intl', 'index', 'world', 'us', 'uk',
        'updates', 'update', 'latest', 'breaking', 'new', 'article'
    }
    extensions = ('.html', '.php')

    try:
        path = urllib.parse.urlparse(url).path
        words = []
        # Split before decoding so an escaped "%2F" cannot create segments.
        for raw_segment in path.split('/'):
            segment = urllib.parse.unquote(raw_segment)

            # Strip an extension only at the END of a segment. Replacing the
            # substring anywhere would mangle paths such as
            # "/indexing-the-web"; a literal "index" segment is already
            # dropped by common_words.
            for extension in extensions:
                if segment[-len(extension):].lower() == extension:
                    segment = segment[:-len(extension)]
                    break

            parts = segment.replace('-', ' ').replace('_', ' ').split()

            # Skip empty segments and segments that are just dates or numbers
            if not parts or all(part.isdigit() for part in parts):
                continue

            words.extend(
                part.lower() for part in parts
                if part.lower() not in common_words and len(part) > 1
            )
        return ' '.join(words)
    except (AttributeError, TypeError, ValueError):
        # Best effort: an unparsable or non-string URL yields no text.
        return ''
def google_search_scraper(search_text: str, site_domain: str, timeout: float = 10.0) -> List[str]:
    """
    Scrape a Google results page for links related to ``search_text``.

    Args:
        search_text: The text to search for.
        site_domain: Domain placed in the query's ``site:`` term.
        timeout: Seconds to wait for Google before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        Absolute (``http...``) links taken from at most the first five
        result containers.

    Raises:
        HTTPException: 500 when the request fails, times out, or returns an
            error status.
    """
    # NOTE(review): the site: term is sent inside double quotes; confirm
    # Google still treats it as a site filter rather than a literal phrase.
    query = f"{search_text} \"site:{site_domain}\""
    encoded_query = urllib.parse.quote(query)

    base_url = "https://www.google.com/search"
    search_url = f"{base_url}?q={encoded_query}"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(search_url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        raise HTTPException(status_code=500, detail=f"Error scraping {site_domain}: {str(e)}") from e

    soup = BeautifulSoup(response.content, 'html.parser')

    # 'g' is the CSS class this scraper expects on each result container.
    urls = []
    for result in soup.find_all('div', class_='g')[:5]:
        link = result.find('a')
        href = link.get('href') if link else None
        # Keep absolute links only.
        if href and href.startswith('http'):
            urls.append(href)
    return urls
def calculate_similarity(query_embedding: List[float], url_embedding: List[float]) -> float:
    """
    Return the cosine similarity of two embedding vectors.

    Args:
        query_embedding: Embedding of the search query.
        url_embedding: Embedding of the text extracted from a URL.

    Returns:
        The cosine of the angle between the vectors, or ``0.0`` when either
        vector has zero length (the ratio is undefined there and would
        otherwise come out as NaN).
    """
    query_array = np.asarray(query_embedding, dtype=float)
    url_array = np.asarray(url_embedding, dtype=float)

    denominator = np.linalg.norm(url_array) * np.linalg.norm(query_array)
    if denominator == 0:
        return 0.0
    return float(np.dot(url_array, query_array) / denominator)
@scrap_websites_router.post("/search", response_model=SearchResponse)
async def search_websites(request: SearchRequest):
    """
    Search each requested domain and keep the hits that resemble the query.

    For every domain the scraped URLs are reduced to the words in their
    path, those words are embedded, and each URL is scored by cosine
    similarity against the query embedding. URLs scoring at or above the
    threshold go into ``results``; every scored URL is reported in
    ``url_similarities``, highest first. A failure for one domain is
    recorded in ``error_messages`` and does not stop the other domains.

    NOTE(review): the scraper and the embedding client are synchronous, so
    this coroutine blocks the event loop while they run.
    """
    results = {}
    error_messages = {}
    url_similarities = {}

    # Initialize OpenAI client
    openai_client = OpenAIClient(OPENAI_API_KEY)

    query_embedding = openai_client.get_embeddings([request.search_text])[0]

    # Higher similarity threshold for better filtering
    SIMILARITY_THRESHOLD = 0.75

    for domain in request.site_domains:
        try:
            urls = google_search_scraper(request.search_text, domain)

            # Skip URLs with no meaningful text extracted
            candidates = []
            for url in urls:
                url_text = extract_url_text(url)
                if url_text:
                    candidates.append((url, url_text))

            # One embeddings request per domain instead of one per URL.
            # Skipped entirely when nothing is left to embed.
            embeddings = (
                openai_client.get_embeddings([text for _, text in candidates])
                if candidates else []
            )

            url_sims = []
            valid_urls = []
            for (url, url_text), url_embedding in zip(candidates, embeddings):
                similarity = calculate_similarity(query_embedding, url_embedding)

                url_sims.append(UrlSimilarityInfo(
                    url=url,
                    similarity=similarity,
                    extracted_text=url_text
                ))

                if similarity >= SIMILARITY_THRESHOLD:
                    valid_urls.append(url)

            results[domain] = valid_urls
            url_similarities[domain] = sorted(url_sims,
                                              key=lambda x: x.similarity,
                                              reverse=True)
        except HTTPException as e:
            error_messages[domain] = str(e.detail)
        except Exception as e:
            error_messages[domain] = f"Unexpected error for {domain}: {str(e)}"

    return SearchResponse(
        results=results,
        error_messages=error_messages,
        url_similarities=url_similarities
    )

View file

@ -1,7 +1,14 @@
from pydantic import BaseModel, Field, HttpUrl, validator, ConfigDict
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Any, Union
from enum import Enum
from datetime import datetime
from urllib.parse import urlparse
# Common Models
class TokenUsage(BaseModel):
    # Token counters reported with a fact-check response.
    # Each counter defaults to 0 and may also be None.
    prompt_tokens: Optional[int] = 0
    completion_tokens: Optional[int] = 0
    total_tokens: Optional[int] = 0
class ErrorResponse(BaseModel):
detail: str
@ -18,11 +25,7 @@ class ErrorResponse(BaseModel):
}
})
class RequestValidationError(BaseModel):
loc: List[str]
msg: str
type: str
# Fact Check Models
class Publisher(BaseModel):
name: str
site: Optional[str] = Field(None, description="Publisher's website")
@ -47,11 +50,116 @@ class Claim(BaseModel):
claimDate: Optional[str] = None
claimReview: List[ClaimReview]
class FactCheckResponse(BaseModel):
query: str = Field(..., description="Original query that was fact-checked")
total_claims_found: int = Field(..., ge=0)
results: List[Claim] = Field(default_factory=list)
summary: Dict[str, int] = Field(...)
class SourceType(str, Enum):
    # Kind of site a fact-check source is. Members are also plain strings,
    # so they compare equal to their values.
    FACT_CHECKER = "fact_checker"
    NEWS_SITE = "news_site"
class FactCheckSource(BaseModel):
    # One site that can be queried for fact checks.
    domain: str
    type: SourceType
    # Bounded to 1..10 and defaults to 1.
    priority: int = Field(1, ge=1, le=10)
# Verification Models
class VerificationResult(BaseModel):
    # Outcome of verifying a single claim.
    verdict: str = Field(..., description="True/False/Insufficient Information")
    confidence: str = Field(..., description="High/Medium/Low")
    # Accepts either one passage or a list of passages.
    evidence: Union[str, List[str]]
    reasoning: str
    missing_info: Optional[str] = None

    # Example payload published in the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "verdict": "True",
                "confidence": "High",
                "evidence": ["Direct quote from source supporting the claim"],
                "reasoning": "Detailed analysis of why the claim is considered true",
                "missing_info": "Any caveats or limitations of the verification",
            }
        }
    )
# Request Models
class BaseFactCheckRequest(BaseModel):
    # Fields shared by every fact-check request: the claim text itself.
    content: str = Field(
        ...,
        min_length=10,
        max_length=1000,
        description="The claim to be fact-checked",
    )

    @validator("content")
    def validate_content(cls, v):
        # Store the claim without surrounding whitespace; reject blank input.
        stripped = v.strip()
        if stripped:
            return stripped
        raise ValueError("Content cannot be empty or just whitespace")
class GoogleFactCheckRequest(BaseFactCheckRequest):
    # Claim text plus the options of the Google-based fact-check endpoint.

    # Language tag shaped like "en-US": two lower-case, dash, two upper-case letters.
    language: str = Field("en-US", pattern="^[a-z]{2}-[A-Z]{2}$")
    # Bounded to 1..50 and defaults to 10.
    max_results_per_source: int = Field(10, ge=1, le=50)
class AIFactCheckRequest(BaseFactCheckRequest):
    # Claim text plus the 1-5 webpage URLs it should be checked against.
    urls: List[str] = Field(
        ...,
        # Pydantic v2 names for the deprecated min_items / max_items.
        min_length=1,
        max_length=5,
        description="List of URLs to check the content against. URLs will be prefixed with https:// if protocol is missing"
    )

    # NOTE(review): `validator` is the Pydantic v1-style decorator; switching
    # to `field_validator` needs a change to this file's pydantic import.
    @validator('urls')
    def validate_urls(cls, urls):
        """
        Normalise and sanity-check every URL.

        Surrounding whitespace is removed, ``https://`` is prepended when no
        http(s) scheme is present, and the result must parse to a URL with a
        host part.

        Raises:
            ValueError: for a blank entry or a URL without a host.
        """
        validated_urls = []
        for raw_url in urls:
            # Validate and store the stripped value. Prefixing an unstripped
            # URL would produce e.g. "https:// example.com".
            url = raw_url.strip()
            if not url:
                raise ValueError("URL cannot be empty")

            # Add https:// if no protocol specified (schemes are case-insensitive)
            if not url.lower().startswith(('http://', 'https://')):
                url = f'https://{url}'

            try:
                result = urlparse(url)
            except ValueError as e:
                raise ValueError(f"Invalid URL {url}: {str(e)}") from e

            # Raised outside the try so the message is not wrapped a second time.
            if not result.netloc:
                raise ValueError(f"Invalid URL structure for {url}")

            validated_urls.append(url)

        return validated_urls

    # Example payload published in the generated JSON schema.
    model_config = ConfigDict(json_schema_extra={
        "example": {
            "content": "Indian flag was drawn in BUET campus",
            "urls": [
                "www.altnews.in/article-about-flag",
                "www.another-source.com/related-news"
            ]
        }
    })
# Response Models
class BaseFactCheckResponse(BaseModel):
    # Fields shared by the Google and AI fact-check responses.
    query: str
    token_usage: TokenUsage
    sources: List[str]
    context_used: List[str]

    # Example payload published in the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "query": "Example statement to verify",
                "token_usage": {
                    "prompt_tokens": 100,
                    "completion_tokens": 50,
                    "total_tokens": 150,
                },
                "sources": ["source1.com", "source2.com"],
                "context_used": ["Relevant context from sources"],
            }
        }
    )
class GoogleFactCheckResponse(BaseFactCheckResponse):
total_claims_found: int
results: List[Dict[str, Any]]
verification_result: Dict[str, Any]
summary: Dict[str, int]
model_config = ConfigDict(json_schema_extra={
"example": {
@ -68,6 +176,19 @@ class FactCheckResponse(BaseModel):
"textualRating": "True"
}]
}],
"verification_result": {
"verdict": "True",
"confidence": "High",
"evidence": ["Supporting evidence"],
"reasoning": "Detailed analysis"
},
"sources": ["factchecker.com"],
"context_used": ["Relevant context"],
"token_usage": {
"prompt_tokens": 100,
"completion_tokens": 50,
"total_tokens": 150
},
"summary": {
"total_sources": 1,
"fact_checking_sites_queried": 10
@ -75,35 +196,41 @@ class FactCheckResponse(BaseModel):
}
})
class SourceType(str, Enum):
FACT_CHECKER = "fact_checker"
NEWS_SITE = "news_site"
class FactCheckSource(BaseModel):
domain: str
type: SourceType
priority: int = Field(default=1, ge=1, le=10)
class AIFactCheckResponse(BaseFactCheckResponse):
    # Response of the AI fact-check endpoint: the shared response fields plus
    # one VerificationResult per checked URL, keyed by that URL.
    verification_result: Dict[str, VerificationResult]

    # Example payload published in the generated JSON schema. It lists only
    # fields this model has (no source domain/type/priority keys).
    model_config = ConfigDict(json_schema_extra={
        "example": {
            "query": "Indian flag was drawn in BUET campus",
            "verification_result": {
                "https://www.source1.com": {
                    "verdict": "True",
                    "confidence": "High",
                    "evidence": ["Supporting evidence from source 1"],
                    "reasoning": "Detailed analysis from source 1",
                    "missing_info": None
                },
                "https://www.source2.com": {
                    "verdict": "True",
                    "confidence": "Medium",
                    "evidence": ["Supporting evidence from source 2"],
                    "reasoning": "Analysis from source 2",
                    "missing_info": "Additional context needed"
                }
            },
            "sources": ["source1.com", "source2.com"],
            "context_used": [
                "Context from source 1",
                "Context from source 2"
            ],
            "token_usage": {
                "prompt_tokens": 200,
                "completion_tokens": 100,
                "total_tokens": 300
            }
        }
    })
class FactCheckRequest(BaseModel):
content: str = Field(
...,
min_length=10,
max_length=1000,
description="The claim to be fact-checked"
)
language: str = Field(default="en-US", pattern="^[a-z]{2}-[A-Z]{2}$")
max_results_per_source: int = Field(default=10, ge=1, le=50)
@validator('content')
def validate_content(cls, v):
if not v.strip():
raise ValueError("Content cannot be empty or just whitespace")
return v.strip()
# Backwards compatibility aliases: the un-prefixed names stay importable and
# resolve to the Google-specific models (presumably for modules that still
# import the old names — confirm before removing).
FactCheckRequest = GoogleFactCheckRequest
FactCheckResponse = GoogleFactCheckResponse

View file

@ -0,0 +1,173 @@
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from typing import List, Dict, Any
import numpy as np
import logging as logger
import openai
import json
class OpenAIClient:
    """Thin wrapper around the module-level ``openai`` API (pre-1.0 call style)."""

    def __init__(self, api_key: str):
        """
        Initialize OpenAI client with the provided API key.

        The key is stored on the ``openai`` module itself, so it is shared by
        every instance in the process.
        """
        openai.api_key = api_key

    async def generate_text_response(self, system_prompt: str, user_prompt: str, max_tokens: int) -> dict:
        """
        Generate a response using OpenAI's chat completion API.

        The model's reply is expected to be a JSON document and is parsed
        before it is returned.

        Args:
            system_prompt: Content of the system message.
            user_prompt: Content of the user message.
            max_tokens: Upper bound on completion tokens.

        Returns:
            A dict with ``response`` (the parsed JSON reply) and the
            ``prompt_tokens`` / ``completion_tokens`` / ``total_tokens``
            counters reported by the API.

        Raises:
            Exception: when the reply is not valid JSON, or when the API call
                fails; the original error is attached as ``__cause__``.
        """
        try:
            # Awaitable variant of the call: the synchronous `create` would
            # block the event loop and serialise every concurrent request.
            response = await openai.ChatCompletion.acreate(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                max_tokens=max_tokens
            )
            content = response['choices'][0]['message']['content']
            # Parse the JSON string into a dictionary
            parsed_content = json.loads(content)

            return {
                "response": parsed_content,
                "prompt_tokens": response['usage']['prompt_tokens'],
                "completion_tokens": response['usage']['completion_tokens'],
                "total_tokens": response['usage']['total_tokens']
            }
        except json.JSONDecodeError as e:
            raise Exception(f"Failed to parse OpenAI response as JSON: {str(e)}") from e
        except Exception as e:
            raise Exception(f"OpenAI text generation error: {str(e)}") from e

    def get_embeddings(self, texts: List[str]) -> List[List[float]]:
        """
        Retrieve embeddings for a list of texts using OpenAI's embedding API.

        This call is synchronous; async callers should run it off the event
        loop.

        Args:
            texts: The strings to embed, sent as one request.

        Returns:
            One embedding vector per entry of the API's ``data`` list.

        Raises:
            Exception: when the API call fails; the original error is
                attached as ``__cause__``.
        """
        try:
            response = openai.Embedding.create(
                input=texts,
                model="text-embedding-ada-002"
            )
            return [data['embedding'] for data in response['data']]
        except Exception as e:
            raise Exception(f"OpenAI embedding error: {str(e)}") from e
class AIFactChecker:
    """Checks a statement against a webpage: scrape, pick relevant chunks, ask the model."""

    def __init__(self, openai_client: OpenAIClient):
        """Initialize the fact checker with OpenAI client."""
        self.openai_client = openai_client
        # Chunks of up to 1000 characters with a 200-character overlap.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        )

    async def scrape_webpage(self, url: str) -> List[Document]:
        """
        Scrape webpage content using LangChain's AsyncHtmlLoader.

        Returns:
            The page content, transformed by BeautifulSoupTransformer and
            split into chunks by ``self.text_splitter``.

        Raises:
            Any error from loading, transforming or splitting, after logging it.
        """
        try:
            loader = AsyncHtmlLoader([url])
            docs = await loader.aload()

            bs_transformer = BeautifulSoupTransformer()
            docs_transformed = bs_transformer.transform_documents(docs)
            docs_chunks = self.text_splitter.split_documents(docs_transformed)

            logger.info(f"Successfully scraped webpage | chunks={len(docs_chunks)}")
            return docs_chunks
        except Exception as e:
            logger.error(f"Error scraping webpage | url={url} | error={str(e)}")
            raise

    def find_relevant_chunks(
        self,
        query_embedding: List[float],
        doc_embeddings: List[List[float]],
        docs: List[Document],
        top_k: int = 5
    ) -> List[Document]:
        """
        Find most relevant document chunks using cosine similarity.

        Args:
            query_embedding: Embedding of the statement being checked.
            doc_embeddings: One embedding per entry of ``docs``, same order.
            docs: The candidate chunks.
            top_k: Maximum number of chunks to return (default 5).

        Returns:
            Up to ``top_k`` chunks, most similar first; an empty list when
            there is nothing to rank.
        """
        try:
            if not docs or len(doc_embeddings) == 0 or top_k <= 0:
                return []

            query_array = np.asarray(query_embedding, dtype=float)
            chunks_array = np.asarray(doc_embeddings, dtype=float)

            norms = np.linalg.norm(chunks_array, axis=1) * np.linalg.norm(query_array)
            # A zero-length vector has no direction; score it 0 instead of
            # dividing by zero and ranking on NaN.
            similarities = np.divide(
                chunks_array @ query_array,
                norms,
                out=np.zeros(norms.shape, dtype=float),
                where=norms > 0
            )

            top_indices = np.argsort(similarities)[-top_k:][::-1]
            return [docs[i] for i in top_indices]
        except Exception as e:
            logger.error(f"Error finding relevant chunks | error={str(e)}")
            raise

    async def verify_fact(self, query: str, relevant_docs: List[Document]) -> Dict[str, Any]:
        """
        Verify fact using OpenAI's API with context from relevant documents.

        Returns:
            A dict with ``verification_result`` (the model's parsed JSON
            reply), ``sources`` (distinct ``source`` metadata values of the
            chunks), ``context_used`` (the chunk texts) and ``token_usage``.
        """
        try:
            context_chunks = [doc.page_content for doc in relevant_docs]
            context = "\n\n".join(context_chunks)

            system_prompt = """You are a professional fact-checking assistant. Analyze the provided context
and determine if the given statement is true, false, or if there isn't enough information.
Provide your response in the following JSON format:
{
"verdict": "True/False/Insufficient Information",
"confidence": "High/Medium/Low",
"evidence": "Direct quotes or evidence from the context",
"reasoning": "Your detailed analysis and reasoning",
"missing_info": "Any important missing information (if applicable)"
}"""

            user_prompt = f"""Context:
{context}
Statement to verify: "{query}"
Analyze the statement based on the provided context and return your response in the specified JSON format."""

            response = await self.openai_client.generate_text_response(
                system_prompt=system_prompt,
                user_prompt=user_prompt,
                max_tokens=800
            )

            sources = list({doc.metadata.get('source', 'Unknown source') for doc in relevant_docs})

            return {
                "verification_result": response["response"],
                "sources": sources,
                "context_used": context_chunks,
                "token_usage": {
                    "prompt_tokens": response["prompt_tokens"],
                    "completion_tokens": response["completion_tokens"],
                    "total_tokens": response["total_tokens"]
                }
            }
        except Exception as e:
            logger.error(f"Error verifying fact | error={str(e)}")
            raise

    async def check_fact(self, url: str, query: str) -> Dict[str, Any]:
        """
        Main method to check a fact against a webpage.

        Scrapes ``url``, embeds its chunks and the query, selects the most
        similar chunks and asks the model to verify ``query`` against them.

        Raises:
            ValueError: when no content could be extracted from the page.
            Any error raised by the scraping, embedding or verification steps.
        """
        # Local import: this block cannot edit the module's import list.
        import asyncio

        try:
            docs = await self.scrape_webpage(url)
            if not docs:
                # Fail with a clear message instead of sending an empty
                # batch to the embeddings API.
                raise ValueError(f"No content could be extracted from {url}")

            doc_texts = [doc.page_content for doc in docs]

            # get_embeddings is synchronous; run it in the default executor
            # so other coroutines keep running while it waits on the network.
            loop = asyncio.get_running_loop()
            doc_embeddings, query_embedding = await asyncio.gather(
                loop.run_in_executor(None, self.openai_client.get_embeddings, doc_texts),
                loop.run_in_executor(None, self.openai_client.get_embeddings, [query])
            )

            relevant_docs = self.find_relevant_chunks(query_embedding[0], doc_embeddings, docs)
            return await self.verify_fact(query, relevant_docs)
        except Exception as e:
            logger.error(f"Error checking fact | error={str(e)}")
            raise

View file

@ -1,6 +1,8 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from app.api.fact_check import fact_check_router
from app.api.ai_fact_check import aifact_check_router
from app.api.scrap_websites import scrap_websites_router
from app.config import FRONTEND_URL
# Initialize FastAPI app
@ -39,6 +41,8 @@ async def health_check():
return {"status": "healthy"}
app.include_router(fact_check_router, prefix="")
app.include_router(aifact_check_router, prefix="")
app.include_router(scrap_websites_router, prefix="")
# Include routers (uncomment and modify as needed)
# from routes import some_router