Dev local #2

Merged
utshodey merged 10 commits from dev-local into dev 2024-12-18 11:42:36 +00:00
16 changed files with 481 additions and 430 deletions
Showing only changes of commit 019e07e1b9 - Show all commits

View file

@ -11,7 +11,6 @@ cache:
stages:
- setup
- lint
- test
before_script:
@ -29,14 +28,6 @@ setup:
- venv/
expire_in: 1 hour
lint:
stage: lint
needs:
- setup
script:
- black --check app/ main.py tests/
- flake8 app/ main.py tests/ --max-line-length=100
test:
stage: test
needs:
@ -47,7 +38,7 @@ test:
# Start FastAPI server
- uvicorn main:app --host 0.0.0.0 --port 8000 &
# Wait for server to start
- sleep 10
- sleep 15
# Test health endpoint
- |
RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8000/health)

View file

@ -6,7 +6,7 @@ from app.models.ai_fact_check_models import (
AIFactCheckResponse,
VerificationResult,
TokenUsage,
ErrorResponse
ErrorResponse,
)
from urllib.parse import urlparse
import asyncio
@ -16,13 +16,11 @@ aifact_check_router = APIRouter()
openai_client = OpenAIClient(api_key=OPENAI_API_KEY)
fact_checker = AIFactChecker(openai_client=openai_client)
@aifact_check_router.post(
"/aicheck-facts",
response_model=AIFactCheckResponse,
responses={
400: {"model": ErrorResponse},
500: {"model": ErrorResponse}
}
responses={400: {"model": ErrorResponse}, 500: {"model": ErrorResponse}},
)
async def ai_fact_check(request: AIFactCheckRequest):
"""
@ -57,7 +55,7 @@ async def ai_fact_check(request: AIFactCheckRequest):
confidence="Low",
evidence=f"Error checking URL: {str(result)}",
reasoning="URL processing failed",
missing_info="Could not access or process the URL"
missing_info="Could not access or process the URL",
)
continue
@ -66,7 +64,7 @@ async def ai_fact_check(request: AIFactCheckRequest):
confidence=result["verification_result"]["confidence"],
evidence=result["verification_result"]["evidence"],
reasoning=result["verification_result"]["reasoning"],
missing_info=result["verification_result"].get("missing_info", None)
missing_info=result["verification_result"].get("missing_info", None),
)
results[url] = verification_result
@ -80,24 +78,22 @@ async def ai_fact_check(request: AIFactCheckRequest):
token_usage = TokenUsage(
prompt_tokens=total_prompt_tokens,
completion_tokens=total_completion_tokens,
total_tokens=total_tokens
total_tokens=total_tokens,
)
return AIFactCheckResponse(
query=request.content,
verification_result=results,
sources=list(all_sources),
token_usage=token_usage
token_usage=token_usage,
)
except ValueError as e:
raise HTTPException(
status_code=400,
detail=ErrorResponse(
detail=str(e),
error_code="INVALID_URL",
path="/aicheck-facts"
).dict()
detail=str(e), error_code="INVALID_URL", path="/aicheck-facts"
).dict(),
)
except Exception as e:
raise HTTPException(
@ -105,6 +101,6 @@ async def ai_fact_check(request: AIFactCheckRequest):
detail=ErrorResponse(
detail=f"Error processing fact-check request: {str(e)}",
error_code="PROCESSING_ERROR",
path="/aicheck-facts"
).dict()
path="/aicheck-facts",
).dict(),
)

View file

@ -7,13 +7,14 @@ from app.models.fact_check_models import (
FactCheckRequest,
FactCheckResponse,
ErrorResponse,
Source
Source,
)
from app.websites.fact_checker_website import get_all_sources
fact_check_router = APIRouter()
openai_client = OpenAIClient(OPENAI_API_KEY)
async def generate_fact_report(query: str, fact_check_data: dict) -> FactCheckResponse:
"""Generate a fact check report using OpenAI based on the fact check results."""
try:
@ -70,9 +71,7 @@ Ensure all URLs in sources are complete (including https:// if missing) and each
4. Note any conflicting information between sources"""
response = await openai_client.generate_text_response(
system_prompt=system_prompt,
user_prompt=user_prompt,
max_tokens=1000
system_prompt=system_prompt, user_prompt=user_prompt, max_tokens=1000
)
try:
@ -80,23 +79,22 @@ Ensure all URLs in sources are complete (including https:// if missing) and each
response_data = response["response"]
# Clean up sources before validation
if isinstance(response_data.get('sources'), list):
if isinstance(response_data.get("sources"), list):
cleaned_sources = []
for source in response_data['sources']:
for source in response_data["sources"]:
if isinstance(source, str):
# Convert string sources to Source objects
url = source if source.startswith('http') else f"https://{source}"
cleaned_sources.append({
"url": url,
"name": source
})
url = (
source if source.startswith("http") else f"https://{source}"
)
cleaned_sources.append({"url": url, "name": source})
elif isinstance(source, dict):
# Ensure URL has proper scheme
url = source.get('url', '')
if url and not url.startswith('http'):
source['url'] = f"https://{url}"
url = source.get("url", "")
if url and not url.startswith("http"):
source["url"] = f"https://{url}"
cleaned_sources.append(source)
response_data['sources'] = cleaned_sources
response_data["sources"] = cleaned_sources
fact_check_response = FactCheckResponse(**response_data)
return fact_check_response
@ -108,8 +106,8 @@ Ensure all URLs in sources are complete (including https:// if missing) and each
detail=ErrorResponse(
detail=f"Invalid response format: {str(validation_error)}",
error_code="VALIDATION_ERROR",
path="/check-facts"
).dict()
path="/check-facts",
).dict(),
)
except Exception as e:
@ -119,10 +117,11 @@ Ensure all URLs in sources are complete (including https:// if missing) and each
detail=ErrorResponse(
detail="Error generating fact report",
error_code="FACT_CHECK_ERROR",
path="/check-facts"
).dict()
path="/check-facts",
).dict(),
)
@fact_check_router.post("/check-facts", response_model=FactCheckResponse)
async def check_facts(request: FactCheckRequest):
"""
@ -134,8 +133,8 @@ async def check_facts(request: FactCheckRequest):
detail=ErrorResponse(
detail="Google API key or base URL is not configured",
error_code="CONFIGURATION_ERROR",
path="/check-facts"
).dict()
path="/check-facts",
).dict(),
)
headers = {"Content-Type": "application/json"}
@ -149,14 +148,12 @@ async def check_facts(request: FactCheckRequest):
"query": request.query,
"languageCode": "en-US",
"reviewPublisherSiteFilter": source.domain,
"pageSize": 10
"pageSize": 10,
}
try:
response = await client.get(
GOOGLE_FACT_CHECK_BASE_URL,
params=params,
headers=headers
GOOGLE_FACT_CHECK_BASE_URL, params=params, headers=headers
)
response.raise_for_status()
json_response = response.json()
@ -173,8 +170,7 @@ async def check_facts(request: FactCheckRequest):
try:
search_request = SearchRequest(
search_text=request.query,
source_types=["fact_checkers"]
search_text=request.query, source_types=["fact_checkers"]
)
ai_response = await search_websites(search_request)
@ -187,6 +183,6 @@ async def check_facts(request: FactCheckRequest):
detail=ErrorResponse(
detail="No fact check results found",
error_code="NOT_FOUND",
path="/check-facts"
).dict()
path="/check-facts",
).dict(),
)

View file

@ -7,7 +7,7 @@ from pydantic import BaseModel
from app.models.ai_fact_check_models import (
AIFactCheckRequest,
FactCheckSource,
SourceType
SourceType,
)
from app.websites.fact_checker_website import SOURCES, get_all_sources
from app.api.ai_fact_check import ai_fact_check
@ -18,10 +18,10 @@ class SearchRequest(BaseModel):
search_text: str
source_types: List[str] = ["fact_checkers"]
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)
@ -38,39 +38,46 @@ def get_domain_from_url(url: str) -> str:
try:
parsed = urlparse(url)
domain = parsed.netloc.lower()
if domain.startswith('www.'):
if domain.startswith("www."):
domain = domain[4:]
return domain
except Exception as e:
logger.error(f"Error extracting domain from URL {url}: {str(e)}")
return ""
def is_valid_source_domain(domain: str, sources: List[FactCheckSource]) -> bool:
"""Check if domain matches any source with improved matching logic."""
if not domain:
return False
domain = domain.lower()
if domain.startswith('www.'):
if domain.startswith("www."):
domain = domain[4:]
for source in sources:
source_domain = source.domain.lower()
if source_domain.startswith('www.'):
if source_domain.startswith("www."):
source_domain = source_domain[4:]
if domain == source_domain or domain.endswith('.' + source_domain):
if domain == source_domain or domain.endswith("." + source_domain):
return True
return False
async def build_enhanced_search_query(query: str, sources: List[FactCheckSource]) -> str:
async def build_enhanced_search_query(
query: str, sources: List[FactCheckSource]
) -> str:
"""Build search query with site restrictions."""
site_queries = [f"site:{source.domain}" for source in sources]
site_restriction = " OR ".join(site_queries)
return f"({query}) ({site_restriction})"
async def google_custom_search(query: str, sources: List[FactCheckSource], page: int = 1) -> Optional[Dict]:
async def google_custom_search(
query: str, sources: List[FactCheckSource], page: int = 1
) -> Optional[Dict]:
"""Perform Google Custom Search with enhanced query."""
enhanced_query = await build_enhanced_search_query(query, sources)
start_index = ((page - 1) * RESULTS_PER_PAGE) + 1
@ -80,7 +87,7 @@ async def google_custom_search(query: str, sources: List[FactCheckSource], page:
"cx": GOOGLE_ENGINE_ID,
"q": enhanced_query,
"num": RESULTS_PER_PAGE,
"start": start_index
"start": start_index,
}
async with httpx.AsyncClient(timeout=30.0) as client:
@ -92,6 +99,7 @@ async def google_custom_search(query: str, sources: List[FactCheckSource], page:
logger.error(f"Search error: {str(e)}")
raise HTTPException(status_code=500, detail=f"Search error: {str(e)}")
@scrap_websites_router.post("/search")
async def search_websites(request: SearchRequest):
# Get the source types from the request
@ -115,7 +123,9 @@ async def search_websites(request: SearchRequest):
if len(all_urls) >= 50:
break
search_response = await google_custom_search(request.search_text, selected_sources, page)
search_response = await google_custom_search(
request.search_text, selected_sources, page
)
if not search_response or not search_response.get("items"):
break
@ -132,25 +142,23 @@ async def search_websites(request: SearchRequest):
domain_results[domain] = []
if len(domain_results[domain]) < MAX_URLS_PER_DOMAIN:
domain_results[domain].append({
"url": url,
"title": item.get("title", ""),
"snippet": item.get("snippet", "")
})
domain_results[domain].append(
{
"url": url,
"title": item.get("title", ""),
"snippet": item.get("snippet", ""),
}
)
all_urls.append(url)
if len(all_urls) >= 50:
break
if not all_urls:
return {
"status": "no_results",
"urls_found": 0
}
return {"status": "no_results", "urls_found": 0}
fact_check_request = AIFactCheckRequest(
content=request.search_text,
urls=all_urls[:5]
content=request.search_text, urls=all_urls[:5]
)
return await ai_fact_check(fact_check_request)

View file

@ -4,7 +4,7 @@ from dotenv import load_dotenv
load_dotenv()
GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]
GOOGLE_FACT_CHECK_BASE_URL= os.environ["GOOGLE_FACT_CHECK_BASE_URL"]
GOOGLE_FACT_CHECK_BASE_URL = os.environ["GOOGLE_FACT_CHECK_BASE_URL"]
GOOGLE_ENGINE_ID = os.environ["GOOGLE_ENGINE_ID"]
GOOGLE_SEARCH_URL = os.environ["GOOGLE_SEARCH_URL"]

View file

@ -4,38 +4,46 @@ from enum import Enum
from datetime import datetime
from urllib.parse import urlparse
# Common Models
class TokenUsage(BaseModel):
    """Token consumption reported by the OpenAI API for one request."""
    # All counters default to 0 so the model still validates when the
    # upstream response omits usage information.
    prompt_tokens: Optional[int] = 0
    completion_tokens: Optional[int] = 0
    total_tokens: Optional[int] = 0
class ErrorResponse(BaseModel):
detail: str
error_code: str = Field(..., description="Unique error code for this type of error")
timestamp: str = Field(default_factory=lambda: datetime.now().isoformat())
path: Optional[str] = Field(None, description="The endpoint path where error occurred")
path: Optional[str] = Field(
None, description="The endpoint path where error occurred"
)
model_config = ConfigDict(json_schema_extra={
"example": {
"detail": "Error description",
"error_code": "ERROR_CODE",
"timestamp": "2024-12-09T16:49:30.905765",
"path": "/check-facts"
model_config = ConfigDict(
json_schema_extra={
"example": {
"detail": "Error description",
"error_code": "ERROR_CODE",
"timestamp": "2024-12-09T16:49:30.905765",
"path": "/check-facts",
}
}
})
)
# Fact Check Models
class Publisher(BaseModel):
name: str
site: Optional[str] = Field(None, description="Publisher's website")
@validator('site')
@validator("site")
def validate_site(cls, v):
if v and not (v.startswith('http://') or v.startswith('https://')):
if v and not (v.startswith("http://") or v.startswith("https://")):
return f"https://{v}"
return v
class ClaimReview(BaseModel):
publisher: Publisher
url: Optional[HttpUrl] = None
@ -44,21 +52,25 @@ class ClaimReview(BaseModel):
textualRating: Optional[str] = None
languageCode: str = Field(default="en-US")
class Claim(BaseModel):
    """A single claim with its associated fact-check reviews.

    NOTE(review): field names (claimant, claimDate, claimReview) appear to
    mirror the Google Fact Check Tools API response schema — confirm.
    """
    text: str  # the claim text as reported
    claimant: Optional[str] = None  # who made the claim, when known
    claimDate: Optional[str] = None  # date string of the claim, when provided
    claimReview: List[ClaimReview]  # one or more published reviews of the claim
class SourceType(str, Enum):
    """Category of a configured source site.

    Inherits from str so members serialize as plain strings in JSON.
    """
    FACT_CHECKER = "fact_checker"
    NEWS_SITE = "news_site"
class FactCheckSource(BaseModel):
    """A website to query during fact checking, with a ranking priority."""
    domain: str  # site domain, e.g. "snopes.com" (may include a path)
    type: SourceType  # fact checker vs. news site
    # Constrained to 1..10; elsewhere fact checkers are created with priority=1
    # and news sites with priority=2, so a lower value appears to rank
    # earlier — TODO confirm against get_all_sources().
    priority: int = Field(default=1, ge=1, le=10)
# Verification Models
class VerificationResult(BaseModel):
verdict: str = Field(..., description="True/False/Insufficient Information")
@ -67,44 +79,46 @@ class VerificationResult(BaseModel):
reasoning: str
missing_info: Optional[str] = None
model_config = ConfigDict(json_schema_extra={
"example": {
"verdict": "True",
"confidence": "High",
"evidence": ["Direct quote from source supporting the claim"],
"reasoning": "Detailed analysis of why the claim is considered true",
"missing_info": "Any caveats or limitations of the verification"
model_config = ConfigDict(
json_schema_extra={
"example": {
"verdict": "True",
"confidence": "High",
"evidence": ["Direct quote from source supporting the claim"],
"reasoning": "Detailed analysis of why the claim is considered true",
"missing_info": "Any caveats or limitations of the verification",
}
}
})
)
# Request Models
class BaseFactCheckRequest(BaseModel):
content: str = Field(
...,
min_length=10,
max_length=1000,
description="The claim to be fact-checked"
..., min_length=10, max_length=1000, description="The claim to be fact-checked"
)
@validator('content')
@validator("content")
def validate_content(cls, v):
if not v.strip():
raise ValueError("Content cannot be empty or just whitespace")
return v.strip()
class GoogleFactCheckRequest(BaseFactCheckRequest):
language: str = Field(default="en-US", pattern="^[a-z]{2}-[A-Z]{2}$")
max_results_per_source: int = Field(default=10, ge=1, le=50)
class AIFactCheckRequest(BaseFactCheckRequest):
urls: List[str] = Field(
...,
min_items=1,
max_items=5,
description="List of URLs to check the content against. URLs will be prefixed with https:// if protocol is missing"
description="List of URLs to check the content against. URLs will be prefixed with https:// if protocol is missing",
)
@validator('urls')
@validator("urls")
def validate_urls(cls, urls):
validated_urls = []
for url in urls:
@ -112,8 +126,8 @@ class AIFactCheckRequest(BaseFactCheckRequest):
raise ValueError("URL cannot be empty")
# Add https:// if no protocol specified
if not url.startswith(('http://', 'https://')):
url = f'https://{url}'
if not url.startswith(("http://", "https://")):
url = f"https://{url}"
try:
result = urlparse(url)
@ -125,15 +139,18 @@ class AIFactCheckRequest(BaseFactCheckRequest):
return validated_urls
model_config = ConfigDict(json_schema_extra={
"example": {
"content": "Indian flag was drawn in BUET campus",
"urls": [
"www.altnews.in/article-about-flag",
"www.another-source.com/related-news"
]
model_config = ConfigDict(
json_schema_extra={
"example": {
"content": "Indian flag was drawn in BUET campus",
"urls": [
"www.altnews.in/article-about-flag",
"www.another-source.com/related-news",
],
}
}
})
)
# Response Models
class BaseFactCheckResponse(BaseModel):
@ -141,17 +158,20 @@ class BaseFactCheckResponse(BaseModel):
token_usage: TokenUsage
sources: List[str]
model_config = ConfigDict(json_schema_extra={
"example": {
"query": "Example statement to verify",
"token_usage": {
"prompt_tokens": 100,
"completion_tokens": 50,
"total_tokens": 150
},
"sources": ["source1.com", "source2.com"],
model_config = ConfigDict(
json_schema_extra={
"example": {
"query": "Example statement to verify",
"token_usage": {
"prompt_tokens": 100,
"completion_tokens": 50,
"total_tokens": 150,
},
"sources": ["source1.com", "source2.com"],
}
}
})
)
class GoogleFactCheckResponse(BaseFactCheckResponse):
total_claims_found: int
@ -159,70 +179,79 @@ class GoogleFactCheckResponse(BaseFactCheckResponse):
verification_result: Dict[str, Any]
summary: Dict[str, int]
model_config = ConfigDict(json_schema_extra={
"example": {
"query": "Example claim",
"total_claims_found": 1,
"results": [{
"text": "Example claim text",
"claimant": "Source name",
"claimReview": [{
"publisher": {
"name": "Fact Checker",
"site": "factchecker.com"
},
"textualRating": "True"
}]
}],
"verification_result": {
"verdict": "True",
"confidence": "High",
"evidence": ["Supporting evidence"],
"reasoning": "Detailed analysis"
},
"sources": ["factchecker.com"],
"token_usage": {
"prompt_tokens": 100,
"completion_tokens": 50,
"total_tokens": 150
},
"summary": {
"total_sources": 1,
"fact_checking_sites_queried": 10
}
}
})
class AIFactCheckResponse(BaseFactCheckResponse):
verification_result: Dict[str, VerificationResult] # Changed to Dict to store results per URL
model_config = ConfigDict(json_schema_extra={
"example": {
"query": "Indian flag was drawn in BUET campus",
"verification_result": {
"https://www.source1.com": {
model_config = ConfigDict(
json_schema_extra={
"example": {
"query": "Example claim",
"total_claims_found": 1,
"results": [
{
"text": "Example claim text",
"claimant": "Source name",
"claimReview": [
{
"publisher": {
"name": "Fact Checker",
"site": "factchecker.com",
},
"textualRating": "True",
}
],
}
],
"verification_result": {
"verdict": "True",
"confidence": "High",
"evidence": ["Supporting evidence from source 1"],
"reasoning": "Detailed analysis from source 1",
"missing_info": None
"evidence": ["Supporting evidence"],
"reasoning": "Detailed analysis",
},
"https://www.source2.com": {
"verdict": "True",
"confidence": "Medium",
"evidence": ["Supporting evidence from source 2"],
"reasoning": "Analysis from source 2",
"missing_info": "Additional context needed"
}
},
"sources": ["source1.com", "source2.com"],
"token_usage": {
"prompt_tokens": 200,
"completion_tokens": 100,
"total_tokens": 300
"sources": ["factchecker.com"],
"token_usage": {
"prompt_tokens": 100,
"completion_tokens": 50,
"total_tokens": 150,
},
"summary": {"total_sources": 1, "fact_checking_sites_queried": 10},
}
}
})
)
class AIFactCheckResponse(BaseFactCheckResponse):
verification_result: Dict[
str, VerificationResult
] # Changed to Dict to store results per URL
model_config = ConfigDict(
json_schema_extra={
"example": {
"query": "Indian flag was drawn in BUET campus",
"verification_result": {
"https://www.source1.com": {
"verdict": "True",
"confidence": "High",
"evidence": ["Supporting evidence from source 1"],
"reasoning": "Detailed analysis from source 1",
"missing_info": None,
},
"https://www.source2.com": {
"verdict": "True",
"confidence": "Medium",
"evidence": ["Supporting evidence from source 2"],
"reasoning": "Analysis from source 2",
"missing_info": "Additional context needed",
},
},
"sources": ["source1.com", "source2.com"],
"token_usage": {
"prompt_tokens": 200,
"completion_tokens": 100,
"total_tokens": 300,
},
}
}
)
# Backwards compatibility aliases
FactCheckRequest = GoogleFactCheckRequest

View file

@ -3,74 +3,73 @@ from typing import List, Literal, Union
from datetime import datetime
from enum import Enum
class VerdictEnum(str, Enum):
    """Allowed verdict values for a fact-check response.

    Inherits from str so members serialize as plain strings in JSON.
    """
    TRUE = "True"
    FALSE = "False"
    PARTIALLY_TRUE = "Partially True"
    UNVERIFIED = "Unverified"
class ConfidenceEnum(str, Enum):
    """Allowed confidence levels for a fact-check verdict.

    Inherits from str so members serialize as plain strings in JSON.
    """
    HIGH = "High"
    MEDIUM = "Medium"
    LOW = "Low"
class FactCheckRequest(BaseModel):
query: str = Field(
...,
min_length=3,
max_length=500,
description="The claim or statement to be fact-checked",
example="Did NASA confirm finding alien structures on Mars in 2024?"
example="Did NASA confirm finding alien structures on Mars in 2024?",
)
class Source(BaseModel):
url: str
name: str = ""
@validator('url')
@validator("url")
def validate_url(cls, v):
# Basic URL validation without requiring HTTP/HTTPS
if not v or len(v) < 3:
raise ValueError("URL must not be empty and must be at least 3 characters")
return v
class FactCheckResponse(BaseModel):
claim: str = Field(
...,
min_length=10,
max_length=1000,
description="The exact claim being verified"
)
verdict: VerdictEnum = Field(
...,
description="The verification verdict"
description="The exact claim being verified",
)
verdict: VerdictEnum = Field(..., description="The verification verdict")
confidence: ConfidenceEnum = Field(
...,
description="Confidence level in the verdict"
..., description="Confidence level in the verdict"
)
sources: List[Source] = Field(
...,
min_items=1,
description="List of sources used in verification"
..., min_items=1, description="List of sources used in verification"
)
evidence: str = Field(
...,
min_length=20,
max_length=500,
description="Concise summary of key evidence"
description="Concise summary of key evidence",
)
explanation: str = Field(
...,
min_length=50,
max_length=1000,
description="Detailed explanation of verification findings"
description="Detailed explanation of verification findings",
)
additional_context: str = Field(
...,
min_length=20,
max_length=500,
description="Important context about the verification"
description="Important context about the verification",
)
class Config:
@ -82,19 +81,20 @@ class FactCheckResponse(BaseModel):
"sources": [
{
"url": "https://www.nasa.gov/mars-exploration",
"name": "NASA Mars Exploration"
"name": "NASA Mars Exploration",
},
{
"url": "https://factcheck.org/2024/mars-claims",
"name": "FactCheck.org"
}
"name": "FactCheck.org",
},
],
"evidence": "NASA has made no such announcement. Recent Mars rover images show natural rock formations.",
"explanation": "Multiple fact-checking organizations investigated this claim. NASA's official communications and Mars mission reports from 2024 contain no mention of alien structures. The viral images being shared are misidentified natural geological formations.",
"additional_context": "Similar false claims about alien structures on Mars have circulated periodically since the first Mars rovers began sending back images."
"additional_context": "Similar false claims about alien structures on Mars have circulated periodically since the first Mars rovers began sending back images.",
}
}
class ErrorResponse(BaseModel):
    """Minimal error payload returned in HTTP error responses."""
    detail: str  # human-readable error description
    error_code: str = Field(..., example="VALIDATION_ERROR")  # machine-readable code

View file

@ -1,38 +1,46 @@
from pydantic import BaseModel
from typing import List, Dict
class SearchRequest(BaseModel):
    """Request body for the website-search endpoint."""
    search_text: str  # free-text claim or query to search for
    # Defaults to fact-checking sites only; presumably these strings key into
    # the SOURCES configuration — verify against the /search route.
    source_types: List[str] = ["fact_checkers"]
class Publisher(BaseModel):
    """Publisher of a fact-check review."""
    name: str  # display name of the publisher
    site: str  # publisher's website (no URL-scheme validation applied here)
class ClaimReview(BaseModel):
    """One published review of a claim."""
    publisher: Publisher  # who published the review
    textualRating: str  # the reviewer's verdict text, e.g. "True"
class Claim(BaseModel):
    """A claim with its reviews; all fields required in this variant."""
    claimReview: List[ClaimReview]  # published reviews of this claim
    claimant: str  # who made the claim
    text: str  # the claim text
class Summary(BaseModel):
    """Aggregate counts for a fact-check run."""
    fact_checking_sites_queried: int  # number of sites that were queried
    total_sources: int  # number of sources that contributed results
class TokenUsage(BaseModel):
    """Token consumption for one OpenAI request (all counts required)."""
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
class VerificationResult(BaseModel):
    """Outcome of verifying a claim against source material."""
    verdict: str  # e.g. "True" / "False" — plain string here, not an enum
    confidence: str  # e.g. "High" / "Medium" / "Low"
    evidence: List[str]  # supporting quotes or evidence snippets
    reasoning: str  # explanation of how the verdict was reached
class EnhancedFactCheckResponse(BaseModel):
    """Response pairing the original query with the claims found for it."""
    query: str  # the query that was fact-checked
    results: List[Claim]  # claims (with reviews) matching the query

View file

@ -9,6 +9,7 @@ import json
import aiohttp
from bs4 import BeautifulSoup
class OpenAIClient:
def __init__(self, api_key: str):
"""
@ -16,7 +17,9 @@ class OpenAIClient:
"""
openai.api_key = api_key
async def generate_text_response(self, system_prompt: str, user_prompt: str, max_tokens: int) -> dict:
async def generate_text_response(
self, system_prompt: str, user_prompt: str, max_tokens: int
) -> dict:
"""
Generate a response using OpenAI's chat completion API.
"""
@ -25,19 +28,19 @@ class OpenAIClient:
model="gpt-4",
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt}
{"role": "user", "content": user_prompt},
],
max_tokens=max_tokens
max_tokens=max_tokens,
)
content = response['choices'][0]['message']['content']
content = response["choices"][0]["message"]["content"]
# Parse the JSON string into a dictionary
parsed_content = json.loads(content)
return {
"response": parsed_content, # Now returns a dictionary instead of string
"prompt_tokens": response['usage']['prompt_tokens'],
"completion_tokens": response['usage']['completion_tokens'],
"total_tokens": response['usage']['total_tokens']
"prompt_tokens": response["usage"]["prompt_tokens"],
"completion_tokens": response["usage"]["completion_tokens"],
"total_tokens": response["usage"]["total_tokens"],
}
except json.JSONDecodeError as e:
raise Exception(f"Failed to parse OpenAI response as JSON: {str(e)}")
@ -50,14 +53,14 @@ class OpenAIClient:
"""
try:
response = openai.Embedding.create(
input=texts,
model="text-embedding-ada-002"
input=texts, model="text-embedding-ada-002"
)
embeddings = [data['embedding'] for data in response['data']]
embeddings = [data["embedding"] for data in response["data"]]
return embeddings
except Exception as e:
raise Exception(f"OpenAI embedding error: {str(e)}")
class AIFactChecker:
def __init__(self, openai_client: OpenAIClient):
"""Initialize the fact checker with OpenAI client."""
@ -66,7 +69,7 @@ class AIFactChecker:
chunk_size=1000,
chunk_overlap=200,
length_function=len,
separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
)
async def scrape_webpage(self, url: str) -> List[Document]:
@ -75,23 +78,27 @@ class AIFactChecker:
async with aiohttp.ClientSession() as session:
async with session.get(url) as response:
if response.status != 200:
raise Exception(f"Failed to fetch URL: {url}, status: {response.status}")
raise Exception(
f"Failed to fetch URL: {url}, status: {response.status}"
)
html_content = await response.text()
# Parse HTML with BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')
soup = BeautifulSoup(html_content, "html.parser")
# Create a Document with the parsed content
doc = Document(
page_content=soup.get_text(separator='\n', strip=True),
metadata={"source": url}
page_content=soup.get_text(separator="\n", strip=True),
metadata={"source": url},
)
# Split into chunks
docs_chunks = self.text_splitter.split_documents([doc])
logger.info(f"Successfully scraped webpage | chunks={len(docs_chunks)}")
logger.info(
f"Successfully scraped webpage | chunks={len(docs_chunks)}"
)
return docs_chunks
except Exception as e:
@ -102,7 +109,7 @@ class AIFactChecker:
self,
query_embedding: List[float],
doc_embeddings: List[List[float]],
docs: List[Document]
docs: List[Document],
) -> List[Document]:
"""Find most relevant document chunks using cosine similarity."""
try:
@ -120,7 +127,9 @@ class AIFactChecker:
logger.error(f"Error finding relevant chunks | error={str(e)}")
raise
async def verify_fact(self, query: str, relevant_docs: List[Document]) -> Dict[str, Any]:
async def verify_fact(
self, query: str, relevant_docs: List[Document]
) -> Dict[str, Any]:
"""Verify fact using OpenAI's API with context from relevant documents."""
try:
context = "\n\n".join([doc.page_content for doc in relevant_docs])
@ -145,12 +154,17 @@ class AIFactChecker:
Analyze the statement based on the provided context and return your response in the specified JSON format."""
response = await self.openai_client.generate_text_response(
system_prompt=system_prompt,
user_prompt=user_prompt,
max_tokens=800
system_prompt=system_prompt, user_prompt=user_prompt, max_tokens=800
)
sources = list(set([doc.metadata.get('source', 'Unknown source') for doc in relevant_docs]))
sources = list(
set(
[
doc.metadata.get("source", "Unknown source")
for doc in relevant_docs
]
)
)
return {
"verification_result": response["response"], # This is now a dictionary
@ -158,8 +172,8 @@ class AIFactChecker:
"token_usage": {
"prompt_tokens": response["prompt_tokens"],
"completion_tokens": response["completion_tokens"],
"total_tokens": response["total_tokens"]
}
"total_tokens": response["total_tokens"],
},
}
except Exception as e:
@ -175,7 +189,9 @@ class AIFactChecker:
doc_embeddings = self.openai_client.get_embeddings(doc_texts)
query_embedding = self.openai_client.get_embeddings([query])
relevant_docs = self.find_relevant_chunks(query_embedding[0], doc_embeddings, docs)
relevant_docs = self.find_relevant_chunks(
query_embedding[0], doc_embeddings, docs
)
verification_result = await self.verify_fact(query, relevant_docs)
return verification_result

View file

@ -1,120 +1,125 @@
from typing import Dict, List
import requests
from fastapi import HTTPException
from app.models.ai_fact_check_models import FactCheckSource, ErrorResponse, FactCheckRequest, SourceType
from app.models.ai_fact_check_models import (
FactCheckSource,
ErrorResponse,
FactCheckRequest,
SourceType,
)
# Sources configuration with validation
SOURCES = {
"fact_checkers": [
FactCheckSource(domain=domain, type=SourceType.FACT_CHECKER, priority=1)
for domain in [
"snopes.com",
"politifact.com",
"factcheck.org",
"reuters.com/fact-check",
"apnews.com/hub/ap-fact-check",
"bbc.com/news/reality_check",
"fullfact.org",
"afp.com/fact-check",
"truthorfiction.com",
"leadstories.com",
"checkyourfact.com",
"washingtonpost.com/news/fact-checker",
"factcheck.kz",
"poynter.org/ifcn",
"factcheckeu.info",
"africacheck.org",
"thequint.com/webqoof",
"altnews.in",
"facta.news",
"factcheckni.org",
"mythdetector.ge",
"verificado.mx",
"euvsdisinfo.eu",
"factcheck.afp.com",
"newtral.es",
"maldita.es",
"faktograf.hr",
"demagog.org.pl",
"factnameh.com",
"faktiskt.se",
"teyit.org",
"factly.in",
"boom.live",
"stopfake.org",
"factcheck.ge",
"factcheck.kg",
"factcheck.uz",
"factcheck.tj",
"factcheck.az",
"factcheck.am",
"factcheck.md",
"verafiles.org",
"rappler.com/fact-check",
"vera.com.gt",
"chequeado.com",
"aosfatos.org",
"lasillavacia.com/detector-mentiras",
"colombiacheck.com",
"ecuadorchequea.com",
"elsurti.com/checado",
"verificat.cat",
"mafindo.or.id",
"tempo.co/cek-fakta",
"factcheck.mk",
"raskrinkavanje.ba",
"faktograf.hr",
"demagog.cz",
"faktabaari.fi",
"correctiv.org",
"mimikama.at",
"factcheck.vlaanderen",
"factuel.afp.com",
"nieuwscheckers.nl",
"faktisk.no",
"tjekdet.dk",
"ellinikahoaxes.gr",
"faktograf.id",
"stopfake.kz",
"pesacheck.org",
"dubawa.org",
"namibiafactcheck.org.na",
"zimfact.org",
"ghanafact.com",
"factspace.africa",
"factcrescendo.com",
"vishvasnews.com",
"factcheck.lk",
"newschecker.in",
"boomlive.in",
"digiteye.in",
"indiatoday.in/fact-check",
"factcrescendo.com",
"piyasa.com/fact-check",
"taiwanese.facts.news",
"taiwanfactcheck.com",
"mygopen.com",
"tfc-taiwan.org.tw",
"cofacts.tw",
"rumor.taipei",
"fact.qq.com",
"factcheck.afp.com/list",
"acfta.org",
"crosscheck.firstdraftnews.org",
"healthfeedback.org",
"climatefeedback.org",
"sciencefeedback.co",
"factcheck.aap.com.au",
"emergent.info",
"hoax-slayer.net",
"truthorfiction.com",
"factcheck.media",
"mediawise.org",
"thejournal.ie/factcheck",
"journalistsresource.org",
"metafact.io",
"reporterslab.org/fact-checking"
]
"snopes.com",
"politifact.com",
"factcheck.org",
"reuters.com/fact-check",
"apnews.com/hub/ap-fact-check",
"bbc.com/news/reality_check",
"fullfact.org",
"afp.com/fact-check",
"truthorfiction.com",
"leadstories.com",
"checkyourfact.com",
"washingtonpost.com/news/fact-checker",
"factcheck.kz",
"poynter.org/ifcn",
"factcheckeu.info",
"africacheck.org",
"thequint.com/webqoof",
"altnews.in",
"facta.news",
"factcheckni.org",
"mythdetector.ge",
"verificado.mx",
"euvsdisinfo.eu",
"factcheck.afp.com",
"newtral.es",
"maldita.es",
"faktograf.hr",
"demagog.org.pl",
"factnameh.com",
"faktiskt.se",
"teyit.org",
"factly.in",
"boom.live",
"stopfake.org",
"factcheck.ge",
"factcheck.kg",
"factcheck.uz",
"factcheck.tj",
"factcheck.az",
"factcheck.am",
"factcheck.md",
"verafiles.org",
"rappler.com/fact-check",
"vera.com.gt",
"chequeado.com",
"aosfatos.org",
"lasillavacia.com/detector-mentiras",
"colombiacheck.com",
"ecuadorchequea.com",
"elsurti.com/checado",
"verificat.cat",
"mafindo.or.id",
"tempo.co/cek-fakta",
"factcheck.mk",
"raskrinkavanje.ba",
"faktograf.hr",
"demagog.cz",
"faktabaari.fi",
"correctiv.org",
"mimikama.at",
"factcheck.vlaanderen",
"factuel.afp.com",
"nieuwscheckers.nl",
"faktisk.no",
"tjekdet.dk",
"ellinikahoaxes.gr",
"faktograf.id",
"stopfake.kz",
"pesacheck.org",
"dubawa.org",
"namibiafactcheck.org.na",
"zimfact.org",
"ghanafact.com",
"factspace.africa",
"factcrescendo.com",
"vishvasnews.com",
"factcheck.lk",
"newschecker.in",
"boomlive.in",
"digiteye.in",
"indiatoday.in/fact-check",
"factcrescendo.com",
"piyasa.com/fact-check",
"taiwanese.facts.news",
"taiwanfactcheck.com",
"mygopen.com",
"tfc-taiwan.org.tw",
"cofacts.tw",
"rumor.taipei",
"fact.qq.com",
"factcheck.afp.com/list",
"acfta.org",
"crosscheck.firstdraftnews.org",
"healthfeedback.org",
"climatefeedback.org",
"sciencefeedback.co",
"factcheck.aap.com.au",
"emergent.info",
"hoax-slayer.net",
"truthorfiction.com",
"factcheck.media",
"mediawise.org",
"thejournal.ie/factcheck",
"journalistsresource.org",
"metafact.io",
"reporterslab.org/fact-checking",
]
],
"news_sites": [
FactCheckSource(domain=domain, type=SourceType.NEWS_SITE, priority=2)
@ -133,16 +138,14 @@ SOURCES = {
"www.risingbd.com/english",
"www.dailyindustry.news",
"www.bangladeshpost.net",
"www.daily-bangladesh.com/english"
"www.daily-bangladesh.com/english",
]
]
],
}
async def fetch_fact_checks(
api_key: str,
base_url: str,
query: str,
site: FactCheckSource
api_key: str, base_url: str, query: str, site: FactCheckSource
) -> Dict:
"""
Fetch fact checks from a specific site using the Google Fact Check API
@ -156,7 +159,7 @@ async def fetch_fact_checks(
"query": query,
"languageCode": "en-US",
"reviewPublisherSiteFilter": site.domain,
"pageSize": 10
"pageSize": 10,
}
response = requests.get(base_url, params=params)
@ -168,19 +171,18 @@ async def fetch_fact_checks(
detail=ErrorResponse(
detail=f"Error fetching from {site.domain}: {str(e)}",
error_code="FACT_CHECK_SERVICE_ERROR",
path="/check-facts"
).dict()
path="/check-facts",
).dict(),
)
except ValueError as e:
raise HTTPException(
status_code=500,
detail=ErrorResponse(
detail=str(e),
error_code="CONFIGURATION_ERROR",
path="/check-facts"
).dict()
detail=str(e), error_code="CONFIGURATION_ERROR", path="/check-facts"
).dict(),
)
def get_all_sources() -> List[FactCheckSource]:
"""
Get all sources sorted by priority

View file

@ -7,9 +7,7 @@ from app.config import FRONTEND_URL
# Initialize FastAPI app
app = FastAPI(
title="Your API Title",
description="Your API Description",
version="1.0.0"
title="Your API Title", description="Your API Description", version="1.0.0"
)
# CORS configuration
@ -30,16 +28,19 @@ app.add_middleware(
allow_headers=["*"],
)
# Basic root endpoint
@app.get("/")
async def root():
    """Root landing endpoint.

    Returns:
        dict: a static welcome message confirming the API is reachable.
    """
    payload = {"message": "Welcome to your FastAPI application"}
    return payload
# Health check endpoint
@app.get("/health")
async def health_check():
    """Liveness probe used by the CI pipeline and deployment health checks.

    Returns:
        dict: a static status payload; always {"status": "healthy"}.
    """
    status_payload = {"status": "healthy"}
    return status_payload
app.include_router(fact_check_router, prefix="")
app.include_router(aifact_check_router, prefix="")
app.include_router(scrap_websites_router, prefix="")
@ -50,4 +51,5 @@ app.include_router(scrap_websites_router, prefix="")
if __name__ == "__main__":
    # Local development entry point; deployments are expected to start the
    # ASGI server themselves rather than executing this module as a script.
    import uvicorn
    # reload=True restarts the server on source changes — development only.
    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)

View file

@ -3,16 +3,19 @@ from main import app
client = TestClient(app)
def test_root_endpoint():
    """GET / should answer 200 with the standard welcome payload."""
    res = client.get("/")
    assert res.status_code == 200
    assert res.json() == {"message": "Welcome to your FastAPI application"}
def test_health_endpoint():
    """GET /health should report the service as healthy."""
    res = client.get("/health")
    assert res.status_code == 200
    assert res.json() == {"status": "healthy"}
def test_cors_headers():
response = client.get("/", headers={"Origin": "http://localhost:5173"})
assert response.headers["access-control-allow-origin"] == "http://localhost:5173"