fact check from image is functional
This commit is contained in:
parent
a1a699f9b3
commit
9298352f2e
6 changed files with 376 additions and 73 deletions
Binary file not shown.
|
|
@ -1,9 +1,13 @@
|
||||||
from fastapi import APIRouter, HTTPException
|
from fastapi import APIRouter, HTTPException
|
||||||
import httpx
|
import httpx
|
||||||
from typing import Union
|
import asyncio
|
||||||
|
import logging
|
||||||
|
from typing import Union, Optional, Dict, Any
|
||||||
from app.config import GOOGLE_API_KEY, GOOGLE_FACT_CHECK_BASE_URL, OPENAI_API_KEY
|
from app.config import GOOGLE_API_KEY, GOOGLE_FACT_CHECK_BASE_URL, OPENAI_API_KEY
|
||||||
from app.api.scrap_websites import search_websites, SearchRequest
|
from app.api.scrap_websites import search_websites, SearchRequest
|
||||||
from app.services.openai_client import OpenAIClient
|
from app.services.openai_client import OpenAIClient, AIFactChecker
|
||||||
|
from app.services.image_text_extractor import ImageTextExtractor
|
||||||
|
from app.models.ai_fact_check_models import AIFactCheckResponse
|
||||||
from app.models.fact_check_models import (
|
from app.models.fact_check_models import (
|
||||||
FactCheckRequest,
|
FactCheckRequest,
|
||||||
FactCheckResponse,
|
FactCheckResponse,
|
||||||
|
|
@ -15,11 +19,91 @@ from app.models.fact_check_models import (
|
||||||
)
|
)
|
||||||
from app.websites.fact_checker_website import get_all_sources
|
from app.websites.fact_checker_website import get_all_sources
|
||||||
|
|
||||||
|
# Setup logging
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
fact_check_router = APIRouter()
|
fact_check_router = APIRouter()
|
||||||
openai_client = OpenAIClient(OPENAI_API_KEY)
|
openai_client = OpenAIClient(OPENAI_API_KEY)
|
||||||
|
ai_fact_checker = AIFactChecker(openai_client)
|
||||||
|
image_text_extractor = ImageTextExtractor(OPENAI_API_KEY)
|
||||||
|
|
||||||
|
|
||||||
async def generate_fact_report(query: str, fact_check_data: dict) -> Union[FactCheckResponse, UnverifiedFactCheckResponse]:
|
async def process_url_content(url: str) -> Optional[str]:
    """Extract the text contained in the image located at ``url``.

    The work is delegated to the module-level ``image_text_extractor``.
    This function never raises: any failure is logged and reported as
    ``None`` so the caller can fall back to a query-only fact check.

    Args:
        url: Address of the image whose text should be read.

    Returns:
        The extracted text, or ``None`` when nothing could be extracted or
        the extraction raised.
    """
    try:
        text = await image_text_extractor.extract_text(url, is_url=True)
    except Exception:
        # Best-effort boundary: keep the traceback in the log instead of
        # only the exception message, then report "no text" to the caller.
        logger.exception("Error extracting text from URL: %s", url)
        return None

    if text:
        # Log which URL succeeded and how much text came back; the text
        # itself can be long and is not needed to trace the request.
        logger.info(
            "Successfully extracted text from URL %s (%d characters)",
            url,
            len(text),
        )
    else:
        logger.warning("No text could be extracted from URL: %s", url)
    return text
|
||||||
|
|
||||||
|
|
||||||
|
async def process_fact_check(query: str) -> Union[FactCheckResponse, UnverifiedFactCheckResponse]:
    """Fact-check a single claim and return a structured report.

    The Google Fact Check endpoint is queried once per source returned by
    ``get_all_sources()``, in order; the first response that contains
    claims is turned into a report by ``generate_fact_report``. When no
    source yields claims, the claim is passed to ``search_websites``
    restricted to fact-checker sites. Per-source and search failures are
    logged rather than raised.

    Args:
        query: The claim text to verify.

    Returns:
        The report built by ``generate_fact_report``, or an unverified
        response when the Google API configuration is missing.
    """
    if not GOOGLE_API_KEY or not GOOGLE_FACT_CHECK_BASE_URL:
        return UnverifiedFactCheckResponse(
            claim=query,
            verdict=VerdictEnum.UNVERIFIED,
            confidence=ConfidenceEnum.LOW,
            sources=[],
            evidence="The fact-checking service is not properly configured.",
            explanation="The system is missing required API configuration for fact-checking services.",
            additional_context="This is a temporary system configuration issue."
        )

    headers = {"Content-Type": "application/json"}
    async with httpx.AsyncClient() as client:
        for source in get_all_sources():
            params = {
                "key": GOOGLE_API_KEY,
                "query": query,
                "languageCode": "en-US",
                "reviewPublisherSiteFilter": source.domain,
                "pageSize": 10,
            }

            try:
                response = await client.get(
                    GOOGLE_FACT_CHECK_BASE_URL, params=params, headers=headers
                )
                response.raise_for_status()
                json_response = response.json()

                if json_response.get("claims"):
                    return await generate_fact_report(query, json_response)

            except httpx.HTTPStatusError as e:
                # The message of HTTPStatusError embeds the full request
                # URL, which here carries the API key as a query parameter.
                # Log only the status code so the key never reaches the logs.
                logger.error(
                    "Error with source %s: HTTP %s",
                    source.domain,
                    e.response.status_code,
                )
                continue
            except Exception as e:
                # Deliberate best-effort: one failing source must not stop
                # the remaining sources from being tried.
                logger.error("Error with source %s: %s", source.domain, e)
                continue

    # No source produced claims: fall back to the website search.
    try:
        search_request = SearchRequest(
            search_text=query,
            source_types=["fact_checkers"]
        )

        ai_response = await search_websites(search_request)
        return await generate_fact_report(query, ai_response)

    except Exception as e:
        logger.error(f"Error in AI fact check: {str(e)}")
        return await generate_fact_report(query, {
            "status": "no_results",
            "verification_result": {
                "no_sources_found": True,
                "reason": str(e)
            }
        })
|
||||||
|
|
||||||
|
|
||||||
|
async def generate_fact_report(query: str, fact_check_data: dict | AIFactCheckResponse) -> Union[FactCheckResponse, UnverifiedFactCheckResponse]:
|
||||||
"""Generate a fact check report using OpenAI based on the fact check results."""
|
"""Generate a fact check report using OpenAI based on the fact check results."""
|
||||||
try:
|
try:
|
||||||
base_system_prompt = """You are a professional fact-checking reporter. Your task is to create a detailed fact check report based on the provided data. Focus on accuracy, clarity, and proper citation of sources.
|
base_system_prompt = """You are a professional fact-checking reporter. Your task is to create a detailed fact check report based on the provided data. Focus on accuracy, clarity, and proper citation of sources.
|
||||||
|
|
@ -31,12 +115,23 @@ Rules:
|
||||||
4. Maintain objectivity in the report
|
4. Maintain objectivity in the report
|
||||||
5. If no reliable sources are found, provide a clear explanation why"""
|
5. If no reliable sources are found, provide a clear explanation why"""
|
||||||
|
|
||||||
|
# Handle both dictionary and AIFactCheckResponse
|
||||||
|
if hasattr(fact_check_data, 'verification_result'):
|
||||||
|
# It's an AIFactCheckResponse
|
||||||
|
has_sources = bool(fact_check_data.sources)
|
||||||
|
verification_result = fact_check_data.verification_result
|
||||||
|
fact_check_data_dict = fact_check_data.dict()
|
||||||
|
else:
|
||||||
|
# It's a dictionary
|
||||||
|
has_sources = bool(fact_check_data.get("claims") or fact_check_data.get("urls_found"))
|
||||||
|
verification_result = fact_check_data.get("verification_result", {})
|
||||||
|
fact_check_data_dict = fact_check_data
|
||||||
|
|
||||||
# If no sources were found, return an unverified response
|
# If no sources were found, return an unverified response
|
||||||
if not fact_check_data.get("claims") and (
|
if not has_sources or (
|
||||||
not fact_check_data.get("urls_found") or
|
isinstance(fact_check_data, dict) and
|
||||||
fact_check_data.get("status") == "no_results" or
|
fact_check_data.get("status") == "no_results"
|
||||||
fact_check_data.get("verification_result", {}).get("no_sources_found")
|
) or (verification_result and verification_result.get("no_sources_found")):
|
||||||
):
|
|
||||||
return UnverifiedFactCheckResponse(
|
return UnverifiedFactCheckResponse(
|
||||||
claim=query,
|
claim=query,
|
||||||
verdict=VerdictEnum.UNVERIFIED,
|
verdict=VerdictEnum.UNVERIFIED,
|
||||||
|
|
@ -63,10 +158,10 @@ Rules:
|
||||||
"additional_context": "Important context about the verification process, limitations, or broader implications (1-2 sentences)"
|
"additional_context": "Important context about the verification process, limitations, or broader implications (1-2 sentences)"
|
||||||
}"""
|
}"""
|
||||||
|
|
||||||
if "claims" in fact_check_data:
|
if isinstance(fact_check_data, dict) and "claims" in fact_check_data:
|
||||||
system_prompt = base_system_prompt
|
system_prompt = base_system_prompt
|
||||||
user_prompt = f"""Query: {query}
|
user_prompt = f"""Query: {query}
|
||||||
Fact Check Results: {fact_check_data}
|
Fact Check Results: {fact_check_data_dict}
|
||||||
|
|
||||||
{base_user_prompt}
|
{base_user_prompt}
|
||||||
|
|
||||||
|
|
@ -75,11 +170,10 @@ Rules:
|
||||||
2. Specify verification dates when available
|
2. Specify verification dates when available
|
||||||
3. Name the fact-checking organizations involved
|
3. Name the fact-checking organizations involved
|
||||||
4. Describe the verification process"""
|
4. Describe the verification process"""
|
||||||
|
|
||||||
else:
|
else:
|
||||||
system_prompt = base_system_prompt
|
system_prompt = base_system_prompt
|
||||||
user_prompt = f"""Query: {query}
|
user_prompt = f"""Query: {query}
|
||||||
Fact Check Results: {fact_check_data}
|
Fact Check Results: {fact_check_data_dict}
|
||||||
|
|
||||||
{base_user_prompt}
|
{base_user_prompt}
|
||||||
|
|
||||||
|
|
@ -116,7 +210,7 @@ Rules:
|
||||||
return FactCheckResponse(**response_data)
|
return FactCheckResponse(**response_data)
|
||||||
|
|
||||||
except Exception as validation_error:
|
except Exception as validation_error:
|
||||||
print(f"Response validation error: {str(validation_error)}")
|
logger.error(f"Response validation error: {str(validation_error)}")
|
||||||
return UnverifiedFactCheckResponse(
|
return UnverifiedFactCheckResponse(
|
||||||
claim=query,
|
claim=query,
|
||||||
verdict=VerdictEnum.UNVERIFIED,
|
verdict=VerdictEnum.UNVERIFIED,
|
||||||
|
|
@ -128,7 +222,7 @@ Rules:
|
||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error generating fact report: {str(e)}")
|
logger.error(f"Error generating fact report: {str(e)}")
|
||||||
return UnverifiedFactCheckResponse(
|
return UnverifiedFactCheckResponse(
|
||||||
claim=query,
|
claim=query,
|
||||||
verdict=VerdictEnum.UNVERIFIED,
|
verdict=VerdictEnum.UNVERIFIED,
|
||||||
|
|
@ -139,68 +233,137 @@ Rules:
|
||||||
additional_context="This is a technical error and does not reflect on the truthfulness of the claim."
|
additional_context="This is a technical error and does not reflect on the truthfulness of the claim."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
async def combine_fact_reports(query: str, url_text: str, query_result: Dict[str, Any], url_result: Dict[str, Any]) -> Union[FactCheckResponse, UnverifiedFactCheckResponse]:
    """Merge the query-based and URL-based fact checks into one report.

    Both partial results are handed to ``openai_client`` together with the
    original query and the text extracted from the URL. The returned
    ``sources`` list is normalised, and the remaining fields are used to
    build the response model.

    Args:
        query: The claim as submitted by the caller.
        url_text: Text that was extracted from the submitted URL.
        query_result: Report produced for ``query``, as a plain dict.
        url_result: Report produced for ``url_text``, as a plain dict.

    Returns:
        ``UnverifiedFactCheckResponse`` when the verdict is "Unverified",
        when no sources remain, or when any step raises; otherwise
        ``FactCheckResponse``.
    """

    def _normalise_source(entry):
        # Bare strings become {"url", "name"} records; dict records get a
        # scheme added in place when the url lacks one; other entry types
        # are dropped (signalled by None).
        if isinstance(entry, str):
            link = entry if entry.startswith("http") else f"https://{entry}"
            return {"url": link, "name": entry}
        if isinstance(entry, dict):
            link = entry.get("url", "")
            if link and not link.startswith("http"):
                entry["url"] = f"https://{link}"
            return entry
        return None

    try:
        system_prompt = """You are a professional fact-checking reporter. Your task is to create a comprehensive fact check report by combining and analyzing multiple fact-checking results. Focus on accuracy, clarity, and proper citation of all sources.

Rules:
1. Include all source URLs and names from both result sets
2. Compare and contrast findings from different sources
3. Include dates when available
4. Note any discrepancies between sources
5. Provide a balanced, objective analysis"""

        user_prompt = f"""Original Query: {query}
Extracted Text from URL: {url_text}

First Fact Check Result: {query_result}
Second Fact Check Result: {url_result}

Generate a comprehensive fact check report in this exact JSON format:
{{
"claim": "Write the exact claim being verified",
"verdict": "One of: True/False/Partially True/Unverified",
"confidence": "One of: High/Medium/Low",
"sources": [
{{
"url": "Full URL of the source",
"name": "Name of the source organization"
}}
],
"evidence": "A concise summary of the key evidence from both sources (2-3 sentences)",
"explanation": "A detailed explanation combining findings from both fact checks (3-4 sentences)",
"additional_context": "Important context about differences or similarities in findings (1-2 sentences)"
}}

The report should:
1. Combine sources from both fact checks
2. Compare findings from both analyses
3. Note any differences in conclusions
4. Provide a unified verdict based on all available information"""

        completion = await openai_client.generate_text_response(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            max_tokens=1000
        )
        report_fields = completion["response"]

        # Clean up sources from both results
        raw_sources = report_fields.get("sources")
        if isinstance(raw_sources, list):
            report_fields["sources"] = [
                cleaned
                for cleaned in map(_normalise_source, raw_sources)
                if cleaned is not None
            ]

        unverified = (
            report_fields["verdict"] == "Unverified"
            or not report_fields.get("sources")
        )
        response_model = UnverifiedFactCheckResponse if unverified else FactCheckResponse
        return response_model(**report_fields)

    except Exception as e:
        logger.error(f"Error combining fact reports: {str(e)}")
        return UnverifiedFactCheckResponse(
            claim=query,
            verdict=VerdictEnum.UNVERIFIED,
            confidence=ConfidenceEnum.LOW,
            sources=[],
            evidence="An error occurred while combining fact check reports.",
            explanation="The system encountered an error while trying to combine results from multiple sources.",
            additional_context="This is a technical error and does not reflect on the truthfulness of the claim."
        )
|
||||||
|
|
||||||
|
|
||||||
@fact_check_router.post("/check-facts", response_model=Union[FactCheckResponse, UnverifiedFactCheckResponse])
async def check_facts(request: FactCheckRequest):
    """
    Fetch fact check results and generate a comprehensive report.
    Handles both query-based and URL-based fact checking.
    """
    # NOTE: the docstring above is left word-for-word because the framework
    # exposes route docstrings in the generated API description.
    claim_text = request.query
    image_url = request.url

    # Step 1: try to read text from the URL, when one was supplied.
    extracted_text = None
    if image_url:
        extracted_text = await process_url_content(image_url)

    # A failed extraction is only fatal when there is no query to fall
    # back on.
    if image_url and not extracted_text and not claim_text:
        return UnverifiedFactCheckResponse(
            claim=f"URL check requested: {image_url}",
            verdict=VerdictEnum.UNVERIFIED,
            confidence=ConfidenceEnum.LOW,
            sources=[],
            evidence="Unable to extract text from the provided URL.",
            explanation="The system could not process the content from the provided URL. The URL might be invalid or inaccessible.",
            additional_context="Please provide a valid URL or a text query for fact-checking."
        )

    # Step 2: fact-check the extracted text first, then the query.
    url_report = None
    if extracted_text:
        logger.info(f"Processing fact check for extracted text: {extracted_text}")
        url_report = await process_fact_check(extracted_text)

    query_report = None
    if claim_text:
        query_report = await process_fact_check(claim_text)

    # Step 3: two reports are merged; a single report is returned as-is.
    if query_report and url_report and extracted_text:
        return await combine_fact_reports(
            claim_text,
            extracted_text,
            query_report.dict(),
            url_report.dict(),
        )

    only_report = query_report or url_report
    if only_report:
        return only_report

    # Neither path produced a report.
    return UnverifiedFactCheckResponse(
        claim=claim_text or f"URL: {image_url}",
        verdict=VerdictEnum.UNVERIFIED,
        confidence=ConfidenceEnum.LOW,
        sources=[],
        evidence="Failed to process fact-checking request.",
        explanation="The system encountered errors while processing the fact checks.",
        additional_context="Please try again with different input or contact support if the issue persists."
    )
|
|
||||||
Binary file not shown.
|
|
@ -1,5 +1,5 @@
|
||||||
from pydantic import BaseModel, Field, HttpUrl, validator
|
from pydantic import BaseModel, Field, HttpUrl, validator, root_validator
|
||||||
from typing import List, Literal, Union
|
from typing import List, Literal, Union, Optional
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
|
|
||||||
|
|
@ -18,13 +18,34 @@ class ConfidenceEnum(str, Enum):
|
||||||
|
|
||||||
|
|
||||||
class FactCheckRequest(BaseModel):
    # Request body for a fact check: a text claim, a URL, or both.
    # Documented with comments rather than a class docstring so the
    # generated schema description is left untouched.

    # Free-text claim; optional, but 3-500 characters when supplied.
    query: Optional[str] = Field(
        None,
        min_length=3,
        max_length=500,
        description="The claim or statement to be fact-checked",
        example="Did NASA confirm finding alien structures on Mars in 2024?",
    )
    # Optional URL whose content should be fact-checked.
    url: Optional[str] = Field(
        None,
        description="URL to be fact-checked",
        example="https://example.com/article",
    )

    @root_validator(pre=True)
    def validate_at_least_one(cls, values):
        """Reject requests that carry neither a query nor a url."""
        if values.get('query') or values.get('url'):
            return values
        raise ValueError("At least one of 'query' or 'url' must be provided")

    @validator('url')
    def validate_url(cls, v):
        """Require a supplied url to be at least 3 characters long."""
        if v is None:
            return v
        if len(v) < 3:
            raise ValueError("URL must be at least 3 characters")
        return v
|
||||||
|
|
||||||
|
|
||||||
class Source(BaseModel):
|
class Source(BaseModel):
|
||||||
|
|
|
||||||
119
app/services/image_text_extractor.py
Normal file
119
app/services/image_text_extractor.py
Normal file
|
|
@ -0,0 +1,119 @@
|
||||||
|
import base64
|
||||||
|
import requests
|
||||||
|
import os
|
||||||
|
from io import BytesIO
|
||||||
|
from typing import Tuple, Optional
|
||||||
|
import logging
|
||||||
|
import aiohttp
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class ImageTextExtractor:
    """Extract text from images through the OpenAI chat-completions endpoint.

    An image is read from a local path or downloaded from a URL, base64
    encoded, and sent inline as a ``data:image/<format>;base64,...`` URL
    together with an instruction to return only the text it contains.
    """

    # Subtypes that are passed through unchanged in the data URL. Anything
    # else keeps the historical "png" label so existing callers see the
    # same behaviour for unknown types.
    _KNOWN_FORMATS = ("jpeg", "png", "gif", "webp")
    _FALLBACK_FORMAT = "png"

    def __init__(self, api_key: str):
        """Initialize ImageTextExtractor with OpenAI API key."""
        self.api_key = api_key
        self.api_url = "https://api.openai.com/v1/chat/completions"
        self.headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}"
        }

    @classmethod
    def _format_from_content_type(cls, content_type: str) -> str:
        """Return the image format named by a ``Content-Type`` header value."""
        lowered = content_type.lower()
        # Drop parameters such as "; charset=..." and keep the subtype.
        subtype = lowered.split(";", 1)[0].strip().partition("/")[2]
        if subtype == "jpg":
            subtype = "jpeg"
        if subtype in cls._KNOWN_FORMATS:
            return subtype
        # Same substring heuristic the class always used for odd values
        # such as "image/pjpeg".
        if "jpeg" in lowered or "jpg" in lowered:
            return "jpeg"
        return cls._FALLBACK_FORMAT

    @classmethod
    def _format_from_path(cls, image_path: str) -> str:
        """Return the image format implied by a file name's extension."""
        extension = os.path.splitext(image_path)[1].lower().lstrip(".")
        if extension == "jpg":
            extension = "jpeg"
        if extension in cls._KNOWN_FORMATS:
            return extension
        return cls._FALLBACK_FORMAT

    def encode_image(self, image_path: str) -> str:
        """Encode a local image into base64.

        Args:
            image_path: Path of the file to read.

        Returns:
            The file content as a base64 string.

        Raises:
            Exception: If the file cannot be read; the original error is
                attached as the cause.
        """
        try:
            with open(image_path, "rb") as image_file:
                return base64.b64encode(image_file.read()).decode('utf-8')
        except Exception as e:
            logger.error(f"Error encoding image: {str(e)}")
            raise Exception(f"Error encoding image: {e}") from e

    async def fetch_image_from_url(self, image_url: str) -> Tuple[str, str]:
        """Fetch an image from a URL and encode it as base64.

        Args:
            image_url: Address to download.

        Returns:
            A ``(base64_data, image_format)`` pair.

        Raises:
            ValueError: If the response is an HTML page or not an image.
            Exception: For a non-200 status, a client error, or any other
                failure; the original error is attached as the cause.
        """
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(image_url) as response:
                    if response.status != 200:
                        raise Exception(f"Failed to fetch image: Status {response.status}")

                    content_type = response.headers.get('Content-Type', '')
                    # Media types are case-insensitive, so compare lowered.
                    media_type = content_type.lower()
                    if "text/html" in media_type:
                        raise ValueError("The URL points to a webpage, not an image")
                    if "image" not in media_type:
                        raise ValueError("The URL does not point to a valid image")

                    image_data = await response.read()
                    image_format = self._format_from_content_type(content_type)
                    base64_image = base64.b64encode(image_data).decode('utf-8')
                    return base64_image, image_format

        except aiohttp.ClientError as e:
            logger.error(f"Error fetching image from URL: {str(e)}")
            raise Exception(f"Error fetching image from URL: {e}") from e
        except ValueError:
            # Content-type problems are reported to the caller unchanged.
            raise
        except Exception as e:
            logger.error(f"Unexpected error processing image URL: {str(e)}")
            raise Exception(f"Unexpected error processing image: {e}") from e

    async def extract_text(self, image_input: str, is_url: bool = False) -> Optional[str]:
        """Extract text from an image, either from a local path or URL.

        Args:
            image_input: A URL when ``is_url`` is true, otherwise a local
                file path.
            is_url: Selects how ``image_input`` is interpreted.

        Returns:
            The extracted text, or ``None`` when the input is a web page,
            the model returns no text, or any error occurs (errors are
            logged, never raised).
        """
        try:
            if is_url:
                try:
                    base64_image, image_format = await self.fetch_image_from_url(image_input)
                except ValueError as e:
                    # A web page is an expected non-image input: skip it
                    # quietly. Other ValueErrors are logged by the outer
                    # handler.
                    if "webpage" in str(e):
                        return None
                    raise
            else:
                if not os.path.exists(image_input):
                    raise FileNotFoundError(f"Image file not found: {image_input}")
                base64_image = self.encode_image(image_input)
                image_format = self._format_from_path(image_input)

            payload = {
                "model": "gpt-4-turbo-2024-04-09",
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": "Extract and return only the key text from this image in the original language. Do not provide translations or explanations."
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/{image_format};base64,{base64_image}"
                                }
                            }
                        ]
                    }
                ],
                "max_tokens": 300
            }

            async with aiohttp.ClientSession() as session:
                async with session.post(self.api_url, headers=self.headers, json=payload) as response:
                    if response.status != 200:
                        error_content = await response.text()
                        logger.error(f"API request failed: Status {response.status}, Response: {error_content}")
                        raise Exception(f"API request failed with status {response.status}")

                    result = await response.json()
                    logger.debug("GPT-4 API Response: %s", result)

                    choices = result.get('choices')
                    if choices:
                        # Treat missing content like empty content.
                        content = choices[0]['message']['content']
                        extracted_text = (content or "").strip()
                        if extracted_text:
                            return extracted_text
                    return None

        except (aiohttp.ClientError, ValueError, FileNotFoundError) as e:
            logger.error(f"Error in text extraction: {str(e)}")
            return None
        except Exception as e:
            logger.error(f"Unexpected error in text extraction: {str(e)}")
            return None
|
||||||
Binary file not shown.
Loading…
Add table
Reference in a new issue