fact check from image is functional

2024-12-19 16:37:57 +06:00 · 2024-12-19 16:37:57 +06:00 · 9298352f2e
commit 9298352f2e
parent a1a699f9b3
6 changed files with 376 additions and 73 deletions
--- a/app/api/pycache/fact_check.cpython-312.pyc
+++ b/app/api/pycache/fact_check.cpython-312.pyc
--- a/app/api/fact_check.py
+++ b/app/api/fact_check.py
@ -1,9 +1,13 @@
 from fastapi import APIRouter, HTTPException
 import httpx
-from typing import Union
+import asyncio
 import logging
 from typing import Union, Optional, Dict, Any
 from app.config import GOOGLE_API_KEY, GOOGLE_FACT_CHECK_BASE_URL, OPENAI_API_KEY
 from app.api.scrap_websites import search_websites, SearchRequest
-from app.services.openai_client import OpenAIClient
+from app.services.openai_client import OpenAIClient, AIFactChecker
 from app.services.image_text_extractor import ImageTextExtractor
 from app.models.ai_fact_check_models import AIFactCheckResponse
 from app.models.fact_check_models import (
    FactCheckRequest,
    FactCheckResponse,
@ -15,11 +19,91 @@ from app.models.fact_check_models import (
 )
 from app.websites.fact_checker_website import get_all_sources
 # Setup logging
 logger = logging.getLogger(__name__)
 fact_check_router = APIRouter()
 openai_client = OpenAIClient(OPENAI_API_KEY)
 ai_fact_checker = AIFactChecker(openai_client)
 image_text_extractor = ImageTextExtractor(OPENAI_API_KEY)
-async def generate_fact_report(query: str, fact_check_data: dict) -> Union[FactCheckResponse, UnverifiedFactCheckResponse]:
async def process_url_content(url: str) -> Optional[str]:
    """Run the image text extractor against ``url`` and return its result.

    Args:
        url: Address handed to ``image_text_extractor.extract_text`` with
            ``is_url=True``.

    Returns:
        The extractor's result (a falsy value when nothing was extracted),
        or ``None`` when extraction raised; the exception is logged rather
        than propagated.
    """
    try:
        extracted = await image_text_extractor.extract_text(url, is_url=True)
        if not extracted:
            logger.warning(f"No text could be extracted from URL: {url}")
            return extracted
        logger.info(f"Successfully extracted text from URL: {extracted}")
        return extracted
    except Exception as exc:
        logger.error(f"Error extracting text from URL: {str(exc)}")
        return None
 async def process_fact_check(query: str) -> Union[FactCheckResponse, UnverifiedFactCheckResponse]:
    """Fact-check one query and return a generated report.

    Each source from ``get_all_sources()`` is queried in turn through the
    Google fact-check endpoint; the first response that contains claims is
    passed to ``generate_fact_report``. If no source yields claims, the
    ``search_websites`` fallback is tried, and if that raises, a
    "no_results" payload carrying the error text is reported instead.

    Args:
        query: The claim text to verify.

    Returns:
        The report produced by ``generate_fact_report``, or an unverified
        response when the Google API settings are missing.
    """
    if not (GOOGLE_API_KEY and GOOGLE_FACT_CHECK_BASE_URL):
        return UnverifiedFactCheckResponse(
            claim=query,
            verdict=VerdictEnum.UNVERIFIED,
            confidence=ConfidenceEnum.LOW,
            sources=[],
            evidence="The fact-checking service is not properly configured.",
            explanation="The system is missing required API configuration for fact-checking services.",
            additional_context="This is a temporary system configuration issue."
        )

    request_headers = {"Content-Type": "application/json"}
    async with httpx.AsyncClient() as http:
        for checker in get_all_sources():
            lookup_params = dict(
                key=GOOGLE_API_KEY,
                query=query,
                languageCode="en-US",
                reviewPublisherSiteFilter=checker.domain,
                pageSize=10,
            )
            try:
                reply = await http.get(
                    GOOGLE_FACT_CHECK_BASE_URL,
                    params=lookup_params,
                    headers=request_headers,
                )
                reply.raise_for_status()
                payload = reply.json()
                if payload.get("claims"):
                    return await generate_fact_report(query, payload)
            except Exception as exc:
                # A failing source must not abort the scan; move to the next one.
                logger.error(f"Error with source {checker.domain}: {str(exc)}")
                continue

        # No source returned claims: fall back to the website search.
        try:
            ai_response = await search_websites(
                SearchRequest(search_text=query, source_types=["fact_checkers"])
            )
            return await generate_fact_report(query, ai_response)
        except Exception as exc:
            logger.error(f"Error in AI fact check: {str(exc)}")
            no_results_payload = {
                "status": "no_results",
                "verification_result": {
                    "no_sources_found": True,
                    "reason": str(exc)
                }
            }
            return await generate_fact_report(query, no_results_payload)
 async def generate_fact_report(query: str, fact_check_data: dict | AIFactCheckResponse) -> Union[FactCheckResponse, UnverifiedFactCheckResponse]:
    """Generate a fact check report using OpenAI based on the fact check results."""
    try:
        base_system_prompt = """You are a professional fact-checking reporter. Your task is to create a detailed fact check report based on the provided data. Focus on accuracy, clarity, and proper citation of sources.
@ -31,12 +115,23 @@ Rules:
 4. Maintain objectivity in the report
 5. If no reliable sources are found, provide a clear explanation why"""
        # Handle both dictionary and AIFactCheckResponse
        if hasattr(fact_check_data, 'verification_result'):
            # It's an AIFactCheckResponse
            has_sources = bool(fact_check_data.sources)
            verification_result = fact_check_data.verification_result
            fact_check_data_dict = fact_check_data.dict()
        else:
            # It's a dictionary
            has_sources = bool(fact_check_data.get("claims") or fact_check_data.get("urls_found"))
            verification_result = fact_check_data.get("verification_result", {})
            fact_check_data_dict = fact_check_data
        # If no sources were found, return an unverified response
-        if not fact_check_data.get("claims") and (
+        if not has_sources or (
-            not fact_check_data.get("urls_found") or 
+            isinstance(fact_check_data, dict) and 
-            fact_check_data.get("status") == "no_results" or 
+            fact_check_data.get("status") == "no_results"
-            fact_check_data.get("verification_result", {}).get("no_sources_found")
+        ) or (verification_result and verification_result.get("no_sources_found")):
        ):
            return UnverifiedFactCheckResponse(
                claim=query,
                verdict=VerdictEnum.UNVERIFIED,
@ -63,10 +158,10 @@ Rules:
    "additional_context": "Important context about the verification process, limitations, or broader implications (1-2 sentences)"
 }"""
-        if "claims" in fact_check_data:
+        if isinstance(fact_check_data, dict) and "claims" in fact_check_data:
            system_prompt = base_system_prompt
            user_prompt = f"""Query: {query}
-            Fact Check Results: {fact_check_data}
+            Fact Check Results: {fact_check_data_dict}
            {base_user_prompt}
@ -75,11 +170,10 @@ Rules:
            2. Specify verification dates when available 
            3. Name the fact-checking organizations involved
            4. Describe the verification process"""
        else:
            system_prompt = base_system_prompt
            user_prompt = f"""Query: {query}
-            Fact Check Results: {fact_check_data}
+            Fact Check Results: {fact_check_data_dict}
            {base_user_prompt}
@ -116,7 +210,7 @@ Rules:
            return FactCheckResponse(**response_data)
        except Exception as validation_error:
-            print(f"Response validation error: {str(validation_error)}")
+            logger.error(f"Response validation error: {str(validation_error)}")
            return UnverifiedFactCheckResponse(
                claim=query,
                verdict=VerdictEnum.UNVERIFIED,
@ -128,7 +222,7 @@ Rules:
            )
    except Exception as e:
-        print(f"Error generating fact report: {str(e)}")
+        logger.error(f"Error generating fact report: {str(e)}")
        return UnverifiedFactCheckResponse(
            claim=query,
            verdict=VerdictEnum.UNVERIFIED,
@ -139,68 +233,137 @@ Rules:
            additional_context="This is a technical error and does not reflect on the truthfulness of the claim."
        )
 async def combine_fact_reports(query: str, url_text: str, query_result: Dict[str, Any], url_result: Dict[str, Any]) -> Union[FactCheckResponse, UnverifiedFactCheckResponse]:
    """Merge the query-based and URL-based fact checks into one report.

    Both results are embedded in a prompt sent through ``openai_client``.
    The "sources" list of the reply is normalised so every kept entry is a
    dict whose URL starts with "http". An unverified response is returned
    when the verdict is "Unverified" or no sources remain.

    Args:
        query: The original user query.
        url_text: Text extracted from the submitted URL.
        query_result: Fact-check result for the query, as a dict.
        url_result: Fact-check result for the extracted text, as a dict.

    Returns:
        The combined report, or an unverified error report when anything
        in this function raises.
    """
    try:
        system_prompt = """You are a professional fact-checking reporter. Your task is to create a comprehensive fact check report by combining and analyzing multiple fact-checking results. Focus on accuracy, clarity, and proper citation of all sources.
 Rules:
 1. Include all source URLs and names from both result sets
 2. Compare and contrast findings from different sources
 3. Include dates when available
 4. Note any discrepancies between sources
 5. Provide a balanced, objective analysis"""
        user_prompt = f"""Original Query: {query}
 Extracted Text from URL: {url_text}
 First Fact Check Result: {query_result}
 Second Fact Check Result: {url_result}
 Generate a comprehensive fact check report in this exact JSON format:
 {{
    "claim": "Write the exact claim being verified",
    "verdict": "One of: True/False/Partially True/Unverified",
    "confidence": "One of: High/Medium/Low",
    "sources": [
        {{
            "url": "Full URL of the source",
            "name": "Name of the source organization"
        }}
    ],
    "evidence": "A concise summary of the key evidence from both sources (2-3 sentences)",
    "explanation": "A detailed explanation combining findings from both fact checks (3-4 sentences)",
    "additional_context": "Important context about differences or similarities in findings (1-2 sentences)"
 }}
 The report should:
 1. Combine sources from both fact checks
 2. Compare findings from both analyses
 3. Note any differences in conclusions
 4. Provide a unified verdict based on all available information"""

        reply = await openai_client.generate_text_response(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            max_tokens=1000
        )
        report = reply["response"]

        # Normalise sources: bare strings become dicts, URLs gain a scheme,
        # and entries that are neither str nor dict are dropped.
        raw_sources = report.get("sources")
        if isinstance(raw_sources, list):
            normalized = []
            for entry in raw_sources:
                if isinstance(entry, str):
                    link = entry if entry.startswith("http") else f"https://{entry}"
                    normalized.append({"url": link, "name": entry})
                    continue
                if not isinstance(entry, dict):
                    continue
                link = entry.get("url", "")
                if link and not link.startswith("http"):
                    entry["url"] = f"https://{link}"
                normalized.append(entry)
            report["sources"] = normalized

        is_unverified = report["verdict"] == "Unverified" or not report.get("sources")
        response_model = UnverifiedFactCheckResponse if is_unverified else FactCheckResponse
        return response_model(**report)
    except Exception as exc:
        logger.error(f"Error combining fact reports: {str(exc)}")
        return UnverifiedFactCheckResponse(
            claim=query,
            verdict=VerdictEnum.UNVERIFIED,
            confidence=ConfidenceEnum.LOW,
            sources=[],
            evidence="An error occurred while combining fact check reports.",
            explanation="The system encountered an error while trying to combine results from multiple sources.",
            additional_context="This is a technical error and does not reflect on the truthfulness of the claim."
        )
@fact_check_router.post("/check-facts", response_model=Union[FactCheckResponse, UnverifiedFactCheckResponse])
 async def check_facts(request: FactCheckRequest):
    """
    Fetch fact check results and generate a comprehensive report.
    Handles both query-based and URL-based fact checking.
    """
-    if not GOOGLE_API_KEY or not GOOGLE_FACT_CHECK_BASE_URL:
+    url_text = None
    query_result = None
    url_result = None
    # If URL is provided, try to extract text
    if request.url:
        url_text = await process_url_content(request.url)
        if not url_text and not request.query:
            # Only return early if URL text extraction failed and no query provided
            return UnverifiedFactCheckResponse(
-            claim=request.query,
+                claim=f"URL check requested: {request.url}",
                verdict=VerdictEnum.UNVERIFIED,
                confidence=ConfidenceEnum.LOW,
                sources=[],
-            evidence="The fact-checking service is not properly configured.",
+                evidence="Unable to extract text from the provided URL.",
-            explanation="The system is missing required API configuration for fact-checking services.",
+                explanation="The system could not process the content from the provided URL. The URL might be invalid or inaccessible.",
-            additional_context="This is a temporary system configuration issue."
+                additional_context="Please provide a valid URL or a text query for fact-checking."
            )
-    headers = {"Content-Type": "application/json"}
+        # If URL text was successfully extracted, process it
-    async with httpx.AsyncClient() as client:
+        if url_text:
-        fact_checker_sources = get_all_sources()
+            logger.info(f"Processing fact check for extracted text: {url_text}")
            url_result = await process_fact_check(url_text)
-        for source in fact_checker_sources:
+    # Process query if provided
-            params = {
+    if request.query:
-                "key": GOOGLE_API_KEY,
+        query_result = await process_fact_check(request.query)
                "query": request.query,
                "languageCode": "en-US",
                "reviewPublisherSiteFilter": source.domain,
                "pageSize": 10,
            }
-            try:
+    # If both results are available, combine them
-                response = await client.get(
+    if query_result and url_result and url_text:
-                    GOOGLE_FACT_CHECK_BASE_URL, params=params, headers=headers
+        return await combine_fact_reports(request.query, url_text, 
                                        query_result.dict(), url_result.dict())
    # If only one result is available
    if query_result:
        return query_result
    if url_result:
        return url_result
    # If no valid results
    return UnverifiedFactCheckResponse(
        claim=request.query or f"URL: {request.url}",
        verdict=VerdictEnum.UNVERIFIED,
        confidence=ConfidenceEnum.LOW,
        sources=[],
        evidence="Failed to process fact-checking request.",
        explanation="The system encountered errors while processing the fact checks.",
        additional_context="Please try again with different input or contact support if the issue persists."
    )
                response.raise_for_status()
                json_response = response.json()
                if json_response.get("claims"):
                    return await generate_fact_report(request.query, json_response)
            except httpx.RequestError as e:
                print(f"Error fetching results for site {source.domain}: {str(e)}")
                continue
            except Exception as e:
                print(f"Unexpected error for site {source.domain}: {str(e)}")
                continue
        try:
            search_request = SearchRequest(
                search_text=request.query,
                source_types=["fact_checkers"]
            )
            ai_response = await search_websites(search_request)
            return await generate_fact_report(request.query, ai_response)
        except Exception as e:
            print(f"Error in AI fact check: {str(e)}")
            return await generate_fact_report(request.query, {
                "status": "no_results",
                "verification_result": {
                    "no_sources_found": True,
                    "reason": str(e)
                }
            })
--- a/app/models/pycache/fact_check_models.cpython-312.pyc
+++ b/app/models/pycache/fact_check_models.cpython-312.pyc
--- a/app/models/fact_check_models.py
+++ b/app/models/fact_check_models.py
@ -1,5 +1,5 @@
-from pydantic import BaseModel, Field, HttpUrl, validator
+from pydantic import BaseModel, Field, HttpUrl, validator, root_validator
-from typing import List, Literal, Union
+from typing import List, Literal, Union, Optional
 from datetime import datetime
 from enum import Enum
@ -18,13 +18,34 @@ class ConfidenceEnum(str, Enum):
 class FactCheckRequest(BaseModel):
-    query: str = Field(
+    query: Optional[str] = Field(
-        ...,
+        None,
        min_length=3,
        max_length=500,
        description="The claim or statement to be fact-checked",
        example="Did NASA confirm finding alien structures on Mars in 2024?",
    )
    url: Optional[str] = Field(
        None,
        description="URL to be fact-checked",
        example="https://example.com/article",
    )
    @root_validator(pre=True)
    def validate_at_least_one(cls, values):
        """Validate that at least one of query or url is provided."""
        query = values.get('query')
        url = values.get('url')
        if not query and not url:
            raise ValueError("At least one of 'query' or 'url' must be provided")
        return values
    @validator('url')
    def validate_url(cls, v):
        """Validate URL format if provided."""
        if v is not None and len(v) < 3:
            raise ValueError("URL must be at least 3 characters")
        return v
 class Source(BaseModel):
--- a/app/services/image_text_extractor.py
+++ b/app/services/image_text_extractor.py
@ -0,0 +1,119 @@
 import base64
 import requests
 import os
 from io import BytesIO
 from typing import Tuple, Optional
 import logging
 import aiohttp
 logger = logging.getLogger(__name__)
 class ImageTextExtractor:
    """Extract text from an image by sending it to the OpenAI chat completions API.

    The image comes either from a local file or from a URL. It is base64
    encoded and embedded in the request as a ``data:image/<format>`` URL.
    """

    def __init__(self, api_key: str):
        """Initialize ImageTextExtractor with OpenAI API key."""
        self.api_key = api_key
        self.api_url = "https://api.openai.com/v1/chat/completions"
        self.headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}"
        }

    @staticmethod
    def _format_from_content_type(content_type: str) -> str:
        """Return the data-URL image subtype for an HTTP Content-Type value.

        Matching is case-insensitive. JPEG, GIF and WebP are recognised;
        anything else falls back to "png", as it did before.
        """
        lowered = content_type.lower()
        if "jpeg" in lowered or "jpg" in lowered:
            return "jpeg"
        for known in ("gif", "webp"):
            if known in lowered:
                return known
        return "png"

    @staticmethod
    def _format_from_path(image_path: str) -> str:
        """Return the data-URL image subtype for a local file path.

        The extension is compared case-insensitively, so ".jpeg" and ".JPG"
        map to "jpeg"; unrecognised extensions fall back to "png".
        """
        suffix = os.path.splitext(image_path)[1].lower()
        if suffix in (".jpg", ".jpeg"):
            return "jpeg"
        if suffix == ".gif":
            return "gif"
        if suffix == ".webp":
            return "webp"
        return "png"

    def encode_image(self, image_path: str) -> str:
        """Encode a local image into base64.

        Raises:
            Exception: If the file cannot be read; the original error is
                chained as the cause.
        """
        try:
            with open(image_path, "rb") as image_file:
                return base64.b64encode(image_file.read()).decode('utf-8')
        except Exception as e:
            logger.error(f"Error encoding image: {str(e)}")
            raise Exception(f"Error encoding image: {e}") from e

    async def fetch_image_from_url(self, image_url: str) -> Tuple[str, str]:
        """Fetch an image from a URL and encode it as base64.

        Returns:
            A ``(base64_data, image_format)`` tuple.

        Raises:
            ValueError: If the response is an HTML page or not an image.
            Exception: For a non-200 status, a client error, or any other
                failure (the original error is chained as the cause).
        """
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(image_url) as response:
                    if response.status != 200:
                        raise Exception(f"Failed to fetch image: Status {response.status}")
                    content_type = response.headers.get('Content-Type', '')
                    if "text/html" in content_type:
                        raise ValueError("The URL points to a webpage, not an image")
                    if "image" not in content_type:
                        raise ValueError("The URL does not point to a valid image")
                    image_data = await response.read()
                    image_format = self._format_from_content_type(content_type)
                    base64_image = base64.b64encode(image_data).decode('utf-8')
                    return base64_image, image_format
        except aiohttp.ClientError as e:
            logger.error(f"Error fetching image from URL: {str(e)}")
            raise Exception(f"Error fetching image from URL: {e}") from e
        except ValueError:
            # Content-type problems are passed through for the caller to inspect.
            raise
        except Exception as e:
            logger.error(f"Unexpected error processing image URL: {str(e)}")
            raise Exception(f"Unexpected error processing image: {e}") from e

    async def extract_text(self, image_input: str, is_url: bool = False) -> Optional[str]:
        """Extract text from an image, either from a local path or URL.

        Args:
            image_input: Local file path, or a URL when ``is_url`` is true.
            is_url: Whether ``image_input`` should be fetched over HTTP.

        Returns:
            The stripped text from the first API choice, or ``None`` when
            there is no text or any step fails (failures are logged, never
            raised).
        """
        try:
            if is_url:
                try:
                    base64_image, image_format = await self.fetch_image_from_url(image_input)
                except ValueError as e:
                    # A webpage URL is an expected miss, so it is not logged as an error.
                    if "webpage" in str(e):
                        return None
                    raise
            else:
                if not os.path.exists(image_input):
                    raise FileNotFoundError(f"Image file not found: {image_input}")
                base64_image = self.encode_image(image_input)
                image_format = self._format_from_path(image_input)
            payload = {
                "model": "gpt-4-turbo-2024-04-09",  # Updated model name
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": "Extract and return only the key text from this image in the original language. Do not provide translations or explanations."
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/{image_format};base64,{base64_image}"
                                }
                            }
                        ]
                    }
                ],
                "max_tokens": 300
            }
            async with aiohttp.ClientSession() as session:
                async with session.post(self.api_url, headers=self.headers, json=payload) as response:
                    if response.status != 200:
                        error_content = await response.text()
                        logger.error(f"API request failed: Status {response.status}, Response: {error_content}")
                        raise Exception(f"API request failed with status {response.status}")
                    result = await response.json()
                    logger.debug(f"GPT-4 API Response: {result}")
                    choices = result.get('choices') or []
                    if choices:
                        # The message content may be None; treat that as no text.
                        content = choices[0]['message']['content']
                        extracted_text = content.strip() if content else ""
                        if extracted_text:
                            return extracted_text
                    return None
        except (aiohttp.ClientError, ValueError, FileNotFoundError) as e:
            logger.error(f"Error in text extraction: {str(e)}")
            return None
        except Exception as e:
            logger.error(f"Unexpected error in text extraction: {str(e)}")
            return None
--- a/app/websites/pycache/fact_checker_website.cpython-312.pyc
+++ b/app/websites/pycache/fact_checker_website.cpython-312.pyc