base code added

commit 1a1a713e0f (parent 83a886960b)
10 changed files with 656 additions and 48 deletions
2  .gitignore  vendored
@@ -1,4 +1,4 @@
-env
+.env
 test.py
-/__pycache__/
+__pycache__
Binary file not shown.
Binary file not shown.
112  app/api/ai_fact_check.py  Normal file
@@ -0,0 +1,112 @@
from fastapi import APIRouter, HTTPException
from app.services.openai_client import OpenAIClient, AIFactChecker
from app.config import OPENAI_API_KEY
from app.models.fact_check_models import (
    AIFactCheckRequest,
    AIFactCheckResponse,
    VerificationResult,
    TokenUsage,
    ErrorResponse
)
from urllib.parse import urlparse
import asyncio

# Initialize router and OpenAI client
aifact_check_router = APIRouter()
openai_client = OpenAIClient(api_key=OPENAI_API_KEY)
fact_checker = AIFactChecker(openai_client=openai_client)

@aifact_check_router.post(
    "/aicheck-facts",
    response_model=AIFactCheckResponse,
    responses={
        400: {"model": ErrorResponse},
        500: {"model": ErrorResponse}
    }
)
async def ai_fact_check(request: AIFactCheckRequest):
    """
    Endpoint to fact-check a given statement based on multiple webpage URLs.

    Input:
    - urls: List of webpage URLs to analyze (with or without http/https)
    - content: The fact statement to verify

    Response:
    - JSON response with verification results per URL, sources, and token usage
    """
    try:
        results = {}
        all_sources = set()
        all_contexts = []
        total_prompt_tokens = 0
        total_completion_tokens = 0
        total_tokens = 0

        # Process all URLs concurrently
        tasks = [
            fact_checker.check_fact(url=url, query=request.content)
            for url in request.urls
        ]
        fact_check_results = await asyncio.gather(*tasks, return_exceptions=True)

        # Process results
        for url, result in zip(request.urls, fact_check_results):
            if isinstance(result, Exception):
                # Handle failed URL checks
                results[url] = VerificationResult(
                    verdict="Error",
                    confidence="Low",
                    evidence=f"Error checking URL: {str(result)}",
                    reasoning="URL processing failed",
                    missing_info="Could not access or process the URL"
                )
                continue

            verification_result = VerificationResult(
                verdict=result["verification_result"]["verdict"],
                confidence=result["verification_result"]["confidence"],
                evidence=result["verification_result"]["evidence"],
                reasoning=result["verification_result"]["reasoning"],
                missing_info=result["verification_result"].get("missing_info", None)
            )

            results[url] = verification_result
            all_sources.update(result["sources"])
            all_contexts.extend(result["context_used"])

            # Accumulate token usage
            total_prompt_tokens += result["token_usage"]["prompt_tokens"]
            total_completion_tokens += result["token_usage"]["completion_tokens"]
            total_tokens += result["token_usage"]["total_tokens"]

        token_usage = TokenUsage(
            prompt_tokens=total_prompt_tokens,
            completion_tokens=total_completion_tokens,
            total_tokens=total_tokens
        )

        return AIFactCheckResponse(
            query=request.content,
            verification_result=results,
            sources=list(all_sources),
            context_used=all_contexts,
            token_usage=token_usage
        )

    except ValueError as e:
        raise HTTPException(
            status_code=400,
            detail=ErrorResponse(
                detail=str(e),
                error_code="INVALID_URL",
                path="/aicheck-facts"
            ).dict()
        )
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=ErrorResponse(
                detail=f"Error processing fact-check request: {str(e)}",
                error_code="PROCESSING_ERROR",
                path="/aicheck-facts"
            ).dict()
        )
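Reviewer note: a minimal smoke test for the new endpoint could look like the sketch below. The localhost base URL and port are assumptions, and the payload reuses the example from fact_check_models.py; this snippet is illustrative, not part of the commit.

# Illustrative only: assumes the app is running locally on port 8000.
import requests

payload = {
    "content": "Indian flag was drawn in BUET campus",
    "urls": ["www.altnews.in/article-about-flag"]
}
resp = requests.post("http://localhost:8000/aicheck-facts", json=payload)
resp.raise_for_status()
for url, result in resp.json()["verification_result"].items():
    print(url, result["verdict"], result["confidence"])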
app/api/fact_check.py
@@ -1,11 +1,15 @@
 from fastapi import APIRouter, HTTPException
 import json
 from datetime import datetime
-from typing import Dict
+from typing import Dict, List

 from app.config import GOOGLE_API_KEY, GOOGLE_FACT_CHECK_BASE_URL
 from app.models.fact_check_models import (
-    FactCheckResponse, FactCheckRequest, Claim, ErrorResponse
+    GoogleFactCheckRequest as FactCheckRequest,
+    GoogleFactCheckResponse as FactCheckResponse,
+    Claim,
+    ErrorResponse,
+    TokenUsage
 )
 from app.websites.fact_checker_website import fetch_fact_checks, get_all_sources

@@ -32,6 +36,7 @@ async def check_facts(request: FactCheckRequest) -> FactCheckResponse:
     Check facts using multiple fact-checking sources
     """
     all_results = []
+    verified_results = []

     # Validate configuration
     if not GOOGLE_API_KEY or not GOOGLE_FACT_CHECK_BASE_URL:
@@ -46,6 +51,8 @@ async def check_facts(request: FactCheckRequest) -> FactCheckResponse:

     # Get all sources in priority order
     all_sources = get_all_sources()
+    all_sources_list = []  # To store source URLs
+    contexts_used = []  # To store context snippets

     for source in all_sources:
         try:
@@ -58,11 +65,17 @@ async def check_facts(request: FactCheckRequest) -> FactCheckResponse:

             if "claims" in result:
                 # Validate each claim through Pydantic
-                validated_claims = [
-                    Claim(**claim).dict()
-                    for claim in result["claims"]
-                ]
-                all_results.extend(validated_claims)
+                for claim in result["claims"]:
+                    validated_claim = Claim(**claim).dict()
+                    all_results.append(validated_claim)
+
+                    # Extract source and context information
+                    if "claimReview" in validated_claim:
+                        review = validated_claim["claimReview"][0]
+                        if "publisher" in review and "site" in review["publisher"]:
+                            all_sources_list.append(review["publisher"]["site"])
+                        if "textualRating" in review:
+                            contexts_used.append(review["textualRating"])

         except HTTPException:
             raise
@@ -81,14 +94,33 @@ async def check_facts(request: FactCheckRequest) -> FactCheckResponse:
             ).dict()
         )

-    # Create the response using Pydantic model
+    # Prepare the verification result
+    verification_result = {
+        "verdict": "Insufficient Information",  # Default verdict
+        "confidence": "Low",
+        "evidence": contexts_used,
+        "reasoning": "Based on available fact checks",
+        "missing_info": "Additional verification may be needed"
+    }
+
+    # Create token usage information
+    token_usage = TokenUsage(
+        prompt_tokens=0,
+        completion_tokens=0,
+        total_tokens=0
+    )
+
+    # Create the response using Pydantic model with all required fields
     response = FactCheckResponse(
         query=request.content,
         total_claims_found=len(all_results),
         results=all_results,
+        verification_result=verification_result,
+        sources=list(set(all_sources_list)),
+        context_used=contexts_used,
+        token_usage=token_usage,
         summary={
-            "total_sources": len(set(claim.get("claimReview", [{}])[0].get("publisher", {}).get("site", "")
-                                     for claim in all_results if claim.get("claimReview"))),
+            "total_sources": len(set(all_sources_list)),
             "fact_checking_sites_queried": len(all_sources)
         }
     )
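Net effect of these hunks: check_facts now fills in the response fields it shares with the AI checker, with a fixed default verdict and zeroed token counts. A shape sketch of the added fields (added_fields is a scratch name; values shown are illustrative, not real output):

# Shape sketch only; "factchecker.example" is a placeholder.
added_fields = {
    "verification_result": {
        "verdict": "Insufficient Information",  # default verdict from the diff
        "confidence": "Low",
        "evidence": ["True"],  # textualRating strings gathered per claim
        "reasoning": "Based on available fact checks",
        "missing_info": "Additional verification may be needed"
    },
    "sources": ["factchecker.example"],  # deduplicated publisher sites
    "token_usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
}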
160  app/api/scrap_websites.py  Normal file
@@ -0,0 +1,160 @@
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from typing import List, Dict
import requests
from bs4 import BeautifulSoup
import urllib.parse
import numpy as np
from app.services.openai_client import OpenAIClient
from app.config import OPENAI_API_KEY

scrap_websites_router = APIRouter()

class SearchRequest(BaseModel):
    search_text: str
    site_domains: List[str]

class UrlSimilarityInfo(BaseModel):
    url: str
    similarity: float
    extracted_text: str

class SearchResponse(BaseModel):
    results: Dict[str, List[str]]
    error_messages: Dict[str, str]
    url_similarities: Dict[str, List[UrlSimilarityInfo]]

def extract_url_text(url: str) -> str:
    """Extract and process meaningful text from URL path with improved cleaning"""
    try:
        # Parse the URL and get the path
        parsed = urllib.parse.urlparse(url)
        path = parsed.path

        # Remove common URL parts and file extensions
        path = path.replace('.html', '').replace('/index', '').replace('.php', '')

        # Split path into segments
        segments = [seg for seg in path.split('/') if seg]

        # Remove dates and numbers
        cleaned_segments = []
        for segment in segments:
            # Replace hyphens and underscores with spaces
            segment = segment.replace('-', ' ').replace('_', ' ')

            # Filter out segments that are just dates or numbers
            if not (segment.replace(' ', '').isdigit() or
                    all(part.isdigit() for part in segment.split() if part)):
                cleaned_segments.append(segment)

        # Remove very common words that don't add meaning
        common_words = {
            'www', 'live', 'news', 'intl', 'index', 'world', 'us', 'uk',
            'updates', 'update', 'latest', 'breaking', 'new', 'article'
        }

        # Join segments and split into words
        text = ' '.join(cleaned_segments)
        words = [word.lower() for word in text.split()
                 if word.lower() not in common_words and len(word) > 1]

        return ' '.join(words)
    except Exception:
        return ''

def google_search_scraper(search_text: str, site_domain: str) -> List[str]:
    query = f"{search_text} \"site:{site_domain}\""
    encoded_query = urllib.parse.quote(query)
    base_url = "https://www.google.com/search"
    url = f"{base_url}?q={encoded_query}"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        search_results = soup.find_all('div', class_='g')

        urls = []
        for result in search_results[:5]:
            link = result.find('a')
            if link and 'href' in link.attrs:
                url = link['href']
                if url.startswith('http'):
                    urls.append(url)

        return urls[:5]

    except requests.RequestException as e:
        raise HTTPException(status_code=500, detail=f"Error scraping {site_domain}: {str(e)}")

def calculate_similarity(query_embedding: List[float], url_embedding: List[float]) -> float:
    query_array = np.array(query_embedding)
    url_array = np.array(url_embedding)

    similarity = np.dot(url_array, query_array) / (
        np.linalg.norm(url_array) * np.linalg.norm(query_array)
    )
    return float(similarity)

@scrap_websites_router.post("/search", response_model=SearchResponse)
async def search_websites(request: SearchRequest):
    results = {}
    error_messages = {}
    url_similarities = {}

    # Initialize OpenAI client
    openai_client = OpenAIClient(OPENAI_API_KEY)

    # Enhance search text with key terms
    search_context = request.search_text
    query_embedding = openai_client.get_embeddings([search_context])[0]

    # Higher similarity threshold for better filtering
    SIMILARITY_THRESHOLD = 0.75

    for domain in request.site_domains:
        try:
            urls = google_search_scraper(request.search_text, domain)
            url_sims = []
            valid_urls = []

            for url in urls:
                url_text = extract_url_text(url)

                # Skip URLs with no meaningful text extracted
                if not url_text:
                    continue

                url_embedding = openai_client.get_embeddings([url_text])[0]
                similarity = calculate_similarity(query_embedding, url_embedding)

                url_sims.append(UrlSimilarityInfo(
                    url=url,
                    similarity=similarity,
                    extracted_text=url_text
                ))

                if similarity >= SIMILARITY_THRESHOLD:
                    valid_urls.append(url)

            results[domain] = valid_urls
            url_similarities[domain] = sorted(url_sims,
                                              key=lambda x: x.similarity,
                                              reverse=True)

        except HTTPException as e:
            error_messages[domain] = str(e.detail)
        except Exception as e:
            error_messages[domain] = f"Unexpected error for {domain}: {str(e)}"

    return SearchResponse(
        results=results,
        error_messages=error_messages,
        url_similarities=url_similarities
    )
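A quick way to exercise the new /search route, as a sketch; the base URL and both domains are placeholders, not part of the commit:

# Illustrative only: assumes the app is running locally on port 8000.
import requests

payload = {
    "search_text": "Indian flag was drawn in BUET campus",
    "site_domains": ["altnews.in", "example-news.com"]  # placeholder domains
}
resp = requests.post("http://localhost:8000/search", json=payload)
data = resp.json()
print(data["results"])          # per-domain URLs that cleared the 0.75 threshold
print(data["error_messages"])   # per-domain scraping failures, if any

One design note for review: google_search_scraper parses Google's div.g result markup with a fixed desktop User-Agent and no throttling, so empty result lists or request errors are plausible failure modes if Google changes its markup or rate-limits the client.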
Binary file not shown.
app/models/fact_check_models.py
@@ -1,7 +1,14 @@
 from pydantic import BaseModel, Field, HttpUrl, validator, ConfigDict
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Any, Union
 from enum import Enum
 from datetime import datetime
+from urllib.parse import urlparse

+# Common Models
+class TokenUsage(BaseModel):
+    prompt_tokens: Optional[int] = 0
+    completion_tokens: Optional[int] = 0
+    total_tokens: Optional[int] = 0
+
 class ErrorResponse(BaseModel):
     detail: str
@@ -18,11 +25,7 @@ class ErrorResponse(BaseModel):
         }
     })

-class RequestValidationError(BaseModel):
-    loc: List[str]
-    msg: str
-    type: str

 # Fact Check Models
 class Publisher(BaseModel):
     name: str
     site: Optional[str] = Field(None, description="Publisher's website")
@@ -47,11 +50,116 @@ class Claim(BaseModel):
     claimDate: Optional[str] = None
     claimReview: List[ClaimReview]

-class FactCheckResponse(BaseModel):
-    query: str = Field(..., description="Original query that was fact-checked")
-    total_claims_found: int = Field(..., ge=0)
-    results: List[Claim] = Field(default_factory=list)
-    summary: Dict[str, int] = Field(...)
+class SourceType(str, Enum):
+    FACT_CHECKER = "fact_checker"
+    NEWS_SITE = "news_site"
+
+class FactCheckSource(BaseModel):
+    domain: str
+    type: SourceType
+    priority: int = Field(default=1, ge=1, le=10)
+
+# Verification Models
+class VerificationResult(BaseModel):
+    verdict: str = Field(..., description="True/False/Insufficient Information")
+    confidence: str = Field(..., description="High/Medium/Low")
+    evidence: Union[str, List[str]]
+    reasoning: str
+    missing_info: Optional[str] = None
+
+    model_config = ConfigDict(json_schema_extra={
+        "example": {
+            "verdict": "True",
+            "confidence": "High",
+            "evidence": ["Direct quote from source supporting the claim"],
+            "reasoning": "Detailed analysis of why the claim is considered true",
+            "missing_info": "Any caveats or limitations of the verification"
+        }
+    })
+
+# Request Models
+class BaseFactCheckRequest(BaseModel):
+    content: str = Field(
+        ...,
+        min_length=10,
+        max_length=1000,
+        description="The claim to be fact-checked"
+    )
+
+    @validator('content')
+    def validate_content(cls, v):
+        if not v.strip():
+            raise ValueError("Content cannot be empty or just whitespace")
+        return v.strip()
+
+class GoogleFactCheckRequest(BaseFactCheckRequest):
+    language: str = Field(default="en-US", pattern="^[a-z]{2}-[A-Z]{2}$")
+    max_results_per_source: int = Field(default=10, ge=1, le=50)
+
+class AIFactCheckRequest(BaseFactCheckRequest):
+    urls: List[str] = Field(
+        ...,
+        min_items=1,
+        max_items=5,
+        description="List of URLs to check the content against. URLs will be prefixed with https:// if protocol is missing"
+    )
+
+    @validator('urls')
+    def validate_urls(cls, urls):
+        validated_urls = []
+        for url in urls:
+            if not url.strip():
+                raise ValueError("URL cannot be empty")
+
+            # Add https:// if no protocol specified
+            if not url.startswith(('http://', 'https://')):
+                url = f'https://{url}'
+
+            try:
+                result = urlparse(url)
+                if not result.netloc:
+                    raise ValueError(f"Invalid URL structure for {url}")
+                validated_urls.append(url)
+            except Exception as e:
+                raise ValueError(f"Invalid URL {url}: {str(e)}")
+
+        return validated_urls
+
+    model_config = ConfigDict(json_schema_extra={
+        "example": {
+            "content": "Indian flag was drawn in BUET campus",
+            "urls": [
+                "www.altnews.in/article-about-flag",
+                "www.another-source.com/related-news"
+            ]
+        }
+    })
+
+# Response Models
+class BaseFactCheckResponse(BaseModel):
+    query: str
+    token_usage: TokenUsage
+    sources: List[str]
+    context_used: List[str]
+
+    model_config = ConfigDict(json_schema_extra={
+        "example": {
+            "query": "Example statement to verify",
+            "token_usage": {
+                "prompt_tokens": 100,
+                "completion_tokens": 50,
+                "total_tokens": 150
+            },
+            "sources": ["source1.com", "source2.com"],
+            "context_used": ["Relevant context from sources"]
+        }
+    })
+
+class GoogleFactCheckResponse(BaseFactCheckResponse):
+    total_claims_found: int
+    results: List[Dict[str, Any]]
+    verification_result: Dict[str, Any]
+    summary: Dict[str, int]
+
+    model_config = ConfigDict(json_schema_extra={
+        "example": {
@@ -68,6 +176,19 @@ class FactCheckResponse(BaseModel):
                     "textualRating": "True"
-                }]
+                }],
+            "verification_result": {
+                "verdict": "True",
+                "confidence": "High",
+                "evidence": ["Supporting evidence"],
+                "reasoning": "Detailed analysis"
+            },
+            "sources": ["factchecker.com"],
+            "context_used": ["Relevant context"],
+            "token_usage": {
+                "prompt_tokens": 100,
+                "completion_tokens": 50,
+                "total_tokens": 150
+            },
             "summary": {
                 "total_sources": 1,
                 "fact_checking_sites_queried": 10
@@ -75,35 +196,41 @@ class FactCheckResponse(BaseModel):
         }
     })

-class SourceType(str, Enum):
-    FACT_CHECKER = "fact_checker"
-    NEWS_SITE = "news_site"
-
-class FactCheckSource(BaseModel):
-    domain: str
-    type: SourceType
-    priority: int = Field(default=1, ge=1, le=10)
+class AIFactCheckResponse(BaseFactCheckResponse):
+    verification_result: Dict[str, VerificationResult]  # Changed to Dict to store results per URL

     model_config = ConfigDict(json_schema_extra={
         "example": {
-            "domain": "factcheck.org",
-            "type": "fact_checker",
-            "priority": 1
+            "query": "Indian flag was drawn in BUET campus",
+            "verification_result": {
+                "https://www.source1.com": {
+                    "verdict": "True",
+                    "confidence": "High",
+                    "evidence": ["Supporting evidence from source 1"],
+                    "reasoning": "Detailed analysis from source 1",
+                    "missing_info": None
+                },
+                "https://www.source2.com": {
+                    "verdict": "True",
+                    "confidence": "Medium",
+                    "evidence": ["Supporting evidence from source 2"],
+                    "reasoning": "Analysis from source 2",
+                    "missing_info": "Additional context needed"
+                }
+            },
+            "sources": ["source1.com", "source2.com"],
+            "context_used": [
+                "Context from source 1",
+                "Context from source 2"
+            ],
+            "token_usage": {
+                "prompt_tokens": 200,
+                "completion_tokens": 100,
+                "total_tokens": 300
+            }
         }
     })

-class FactCheckRequest(BaseModel):
-    content: str = Field(
-        ...,
-        min_length=10,
-        max_length=1000,
-        description="The claim to be fact-checked"
-    )
-    language: str = Field(default="en-US", pattern="^[a-z]{2}-[A-Z]{2}$")
-    max_results_per_source: int = Field(default=10, ge=1, le=50)
-
-    @validator('content')
-    def validate_content(cls, v):
-        if not v.strip():
-            raise ValueError("Content cannot be empty or just whitespace")
-        return v.strip()
+# Backwards compatibility aliases
+FactCheckRequest = GoogleFactCheckRequest
+FactCheckResponse = GoogleFactCheckResponse
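The new validators normalize as well as validate. A sketch of the round trip, assuming the package is importable (behavior follows directly from the urls validator above):

# Illustrative: the urls validator prefixes bare domains with https://.
from app.models.fact_check_models import AIFactCheckRequest

req = AIFactCheckRequest(
    content="Indian flag was drawn in BUET campus",
    urls=["www.altnews.in/article-about-flag"]
)
print(req.urls)  # ['https://www.altnews.in/article-about-flag']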
173  app/services/openai_client.py  Normal file
@@ -0,0 +1,173 @@
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from typing import List, Dict, Any
import numpy as np
import logging as logger
import openai
import json

class OpenAIClient:
    def __init__(self, api_key: str):
        """
        Initialize OpenAI client with the provided API key.
        """
        openai.api_key = api_key

    async def generate_text_response(self, system_prompt: str, user_prompt: str, max_tokens: int) -> dict:
        """
        Generate a response using OpenAI's chat completion API.
        """
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                max_tokens=max_tokens
            )
            content = response['choices'][0]['message']['content']
            # Parse the JSON string into a dictionary
            parsed_content = json.loads(content)

            return {
                "response": parsed_content,  # Now returns a dictionary instead of string
                "prompt_tokens": response['usage']['prompt_tokens'],
                "completion_tokens": response['usage']['completion_tokens'],
                "total_tokens": response['usage']['total_tokens']
            }
        except json.JSONDecodeError as e:
            raise Exception(f"Failed to parse OpenAI response as JSON: {str(e)}")
        except Exception as e:
            raise Exception(f"OpenAI text generation error: {str(e)}")

    def get_embeddings(self, texts: List[str]) -> List[List[float]]:
        """
        Retrieve embeddings for a list of texts using OpenAI's embedding API.
        """
        try:
            response = openai.Embedding.create(
                input=texts,
                model="text-embedding-ada-002"
            )
            embeddings = [data['embedding'] for data in response['data']]
            return embeddings
        except Exception as e:
            raise Exception(f"OpenAI embedding error: {str(e)}")

class AIFactChecker:
    def __init__(self, openai_client: OpenAIClient):
        """Initialize the fact checker with OpenAI client."""
        self.openai_client = openai_client
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        )

    async def scrape_webpage(self, url: str) -> List[Document]:
        """Scrape webpage content using LangChain's AsyncHtmlLoader."""
        try:
            loader = AsyncHtmlLoader([url])
            docs = await loader.aload()

            bs_transformer = BeautifulSoupTransformer()
            docs_transformed = bs_transformer.transform_documents(docs)
            docs_chunks = self.text_splitter.split_documents(docs_transformed)

            logger.info(f"Successfully scraped webpage | chunks={len(docs_chunks)}")
            return docs_chunks

        except Exception as e:
            logger.error(f"Error scraping webpage | url={url} | error={str(e)}")
            raise

    def find_relevant_chunks(
        self,
        query_embedding: List[float],
        doc_embeddings: List[List[float]],
        docs: List[Document]
    ) -> List[Document]:
        """Find most relevant document chunks using cosine similarity."""
        try:
            query_array = np.array(query_embedding)
            chunks_array = np.array(doc_embeddings)

            similarities = np.dot(chunks_array, query_array) / (
                np.linalg.norm(chunks_array, axis=1) * np.linalg.norm(query_array)
            )

            top_indices = np.argsort(similarities)[-5:][::-1]
            return [docs[i] for i in top_indices]

        except Exception as e:
            logger.error(f"Error finding relevant chunks | error={str(e)}")
            raise

    async def verify_fact(self, query: str, relevant_docs: List[Document]) -> Dict[str, Any]:
        """Verify fact using OpenAI's API with context from relevant documents."""
        try:
            context = "\n\n".join([doc.page_content for doc in relevant_docs])

            system_prompt = """You are a professional fact-checking assistant. Analyze the provided context
            and determine if the given statement is true, false, or if there isn't enough information.

            Provide your response in the following JSON format:
            {
                "verdict": "True/False/Insufficient Information",
                "confidence": "High/Medium/Low",
                "evidence": "Direct quotes or evidence from the context",
                "reasoning": "Your detailed analysis and reasoning",
                "missing_info": "Any important missing information (if applicable)"
            }"""

            user_prompt = f"""Context:
            {context}

            Statement to verify: "{query}"

            Analyze the statement based on the provided context and return your response in the specified JSON format."""

            response = await self.openai_client.generate_text_response(
                system_prompt=system_prompt,
                user_prompt=user_prompt,
                max_tokens=800
            )

            sources = list(set([doc.metadata.get('source', 'Unknown source') for doc in relevant_docs]))

            return {
                "verification_result": response["response"],  # This is now a dictionary
                "sources": sources,
                "context_used": [doc.page_content for doc in relevant_docs],
                "token_usage": {
                    "prompt_tokens": response["prompt_tokens"],
                    "completion_tokens": response["completion_tokens"],
                    "total_tokens": response["total_tokens"]
                }
            }

        except Exception as e:
            logger.error(f"Error verifying fact | error={str(e)}")
            raise

    async def check_fact(self, url: str, query: str) -> Dict[str, Any]:
        """Main method to check a fact against a webpage."""
        try:
            docs = await self.scrape_webpage(url)

            doc_texts = [doc.page_content for doc in docs]
            doc_embeddings = self.openai_client.get_embeddings(doc_texts)
            query_embedding = self.openai_client.get_embeddings([query])

            relevant_docs = self.find_relevant_chunks(query_embedding[0], doc_embeddings, docs)
            verification_result = await self.verify_fact(query, relevant_docs)

            return verification_result

        except Exception as e:
            logger.error(f"Error checking fact | error={str(e)}")
            raise
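Since AIFactChecker is plain Python, it can also be driven outside FastAPI, which is handy for debugging. A sketch, assuming OPENAI_API_KEY is set in app.config and the target URL is reachable (the URL here is a placeholder):

# Illustrative: run one URL/claim pair through the pipeline directly.
import asyncio
from app.config import OPENAI_API_KEY
from app.services.openai_client import OpenAIClient, AIFactChecker

async def main():
    checker = AIFactChecker(OpenAIClient(api_key=OPENAI_API_KEY))
    result = await checker.check_fact(
        url="https://www.altnews.in/article-about-flag",  # placeholder URL
        query="Indian flag was drawn in BUET campus"
    )
    print(result["verification_result"]["verdict"])
    print(result["token_usage"])

asyncio.run(main())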
4  main.py
@@ -1,6 +1,8 @@
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from app.api.fact_check import fact_check_router
+from app.api.ai_fact_check import aifact_check_router
+from app.api.scrap_websites import scrap_websites_router
 from app.config import FRONTEND_URL

 # Initialize FastAPI app
@@ -39,6 +41,8 @@ async def health_check():
     return {"status": "healthy"}

 app.include_router(fact_check_router, prefix="")
+app.include_router(aifact_check_router, prefix="")
+app.include_router(scrap_websites_router, prefix="")

 # Include routers (uncomment and modify as needed)
 # from routes import some_router
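To verify the wiring end to end, the app can be booted and the existing /health route polled. A sketch that assumes uvicorn is installed (FastAPI's usual ASGI server) and that the FastAPI instance in main.py is named app, as the include_router calls above imply:

# Illustrative: boot the app in-process, then GET /health -> {"status": "healthy"}.
import uvicorn
from main import app

if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=8000)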