# fact-checker-backend/app/services/openai_client.py

import asyncio
import json
import logging as logger
from typing import List, Dict, Any

import aiohttp
import numpy as np
import openai
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain_core.documents import Document


class OpenAIClient:
    """Small wrapper around the module-level ``openai`` SDK.

    Uses the ``openai.ChatCompletion`` / ``openai.Embedding`` entry points
    and exposes two operations: a JSON-returning chat completion and a
    batch embedding lookup.
    """

    def __init__(self, api_key: str):
        """
        Initialize OpenAI client with the provided API key.

        Note: the key is stored on the ``openai`` module itself, so it is
        shared by every client instance in the process.
        """
        openai.api_key = api_key

    @staticmethod
    def _parse_json_content(content: str) -> Any:
        """Parse a model reply as JSON, tolerating a Markdown code fence.

        Chat models sometimes wrap JSON in a fenced block (three backticks,
        optionally followed by a language tag such as ``json``). The fence
        is stripped before parsing; unfenced content is parsed as-is.

        Raises:
            json.JSONDecodeError: if the remaining text is not valid JSON.
        """
        text = content.strip()
        if text.startswith("```"):
            # Drop the whole opening fence line (it may carry a language tag).
            newline_at = text.find("\n")
            text = text[newline_at + 1:] if newline_at != -1 else text[3:]
            text = text.rstrip()
            if text.endswith("```"):
                text = text[:-3]
        return json.loads(text)

    async def generate_text_response(
        self, system_prompt: str, user_prompt: str, max_tokens: int
    ) -> dict:
        """
        Generate a response using OpenAI's chat completion API.

        Args:
            system_prompt: Content of the ``system`` message.
            user_prompt: Content of the ``user`` message.
            max_tokens: Upper bound on completion tokens.

        Returns:
            A dict with the JSON-decoded reply under ``"response"`` plus the
            ``"prompt_tokens"``, ``"completion_tokens"`` and
            ``"total_tokens"`` usage counters reported by the API.

        Raises:
            Exception: if the reply is not valid JSON, or if the API call
                fails for any other reason. The original error is attached
                as ``__cause__``.
        """
        try:
            # Use the awaitable variant so the event loop is not blocked
            # for the duration of the HTTP round trip.
            response = await openai.ChatCompletion.acreate(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                max_tokens=max_tokens,
            )
            content = response["choices"][0]["message"]["content"]
            parsed_content = self._parse_json_content(content)

            usage = response["usage"]
            return {
                "response": parsed_content,
                "prompt_tokens": usage["prompt_tokens"],
                "completion_tokens": usage["completion_tokens"],
                "total_tokens": usage["total_tokens"],
            }
        except json.JSONDecodeError as e:
            raise Exception(
                f"Failed to parse OpenAI response as JSON: {str(e)}"
            ) from e
        except Exception as e:
            raise Exception(f"OpenAI text generation error: {str(e)}") from e

    def get_embeddings(self, texts: List[str]) -> List[List[float]]:
        """
        Retrieve embeddings for a list of texts using OpenAI's embedding API.

        Args:
            texts: Strings to embed.

        Returns:
            One embedding vector per entry of the API's ``data`` list. An
            empty ``texts`` yields ``[]`` without contacting the API.

        Raises:
            Exception: if the API call fails; the original error is attached
                as ``__cause__``.
        """
        if not texts:
            # Nothing to embed; avoid a request with empty input.
            return []
        try:
            response = openai.Embedding.create(
                input=texts, model="text-embedding-ada-002"
            )
            return [item["embedding"] for item in response["data"]]
        except Exception as e:
            raise Exception(f"OpenAI embedding error: {str(e)}") from e


class AIFactChecker:
    """Check a statement against the text of a web page.

    Pipeline: scrape the page, split it into chunks, embed the chunks and
    the statement, keep the most similar chunks, then ask the chat model
    for a verdict using those chunks as context.
    """

    def __init__(self, openai_client: OpenAIClient):
        """Initialize the fact checker with OpenAI client."""
        self.openai_client = openai_client
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        )

    async def scrape_webpage(self, url: str) -> List[Document]:
        """Scrape webpage content without saving HTML files.

        Args:
            url: Address of the page to fetch.

        Returns:
            The page text split into chunk documents, each carrying
            ``{"source": url}`` as metadata.

        Raises:
            Exception: if the response status is not 200. Any other error
                from fetching or parsing is logged and re-raised.
        """
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url) as response:
                    if response.status != 200:
                        raise Exception(
                            f"Failed to fetch URL: {url}, status: {response.status}"
                        )

                    html_content = await response.text()

                    # Reduce the HTML to its text, one line per text node.
                    soup = BeautifulSoup(html_content, "html.parser")
                    doc = Document(
                        page_content=soup.get_text(separator="\n", strip=True),
                        metadata={"source": url},
                    )

                    docs_chunks = self.text_splitter.split_documents([doc])

                    logger.info(
                        "Successfully scraped webpage | chunks=%s", len(docs_chunks)
                    )
                    return docs_chunks

        except Exception as e:
            logger.error("Error scraping webpage | url=%s | error=%s", url, e)
            raise

    def find_relevant_chunks(
        self,
        query_embedding: List[float],
        doc_embeddings: List[List[float]],
        docs: List[Document],
        top_k: int = 5,
    ) -> List[Document]:
        """Find most relevant document chunks using cosine similarity.

        Args:
            query_embedding: Embedding vector of the statement.
            doc_embeddings: One embedding vector per entry in ``docs``.
            docs: Chunks to rank, in the same order as ``doc_embeddings``.
            top_k: Maximum number of chunks to return (default 5).

        Returns:
            Up to ``top_k`` chunks, most similar first. Returns ``[]`` when
            there are no embeddings or ``top_k`` is not positive.
        """
        try:
            query_array = np.asarray(query_embedding, dtype=float)
            chunks_array = np.asarray(doc_embeddings, dtype=float)

            if top_k <= 0 or chunks_array.size == 0:
                return []

            dots = chunks_array @ query_array
            norms = np.linalg.norm(chunks_array, axis=1) * np.linalg.norm(
                query_array
            )
            # A zero-norm vector has no defined cosine similarity. Dividing
            # would give NaN, which argsort places last (i.e. as the "best"
            # match), so such entries are ranked last via -inf instead.
            similarities = np.divide(
                dots,
                norms,
                out=np.full_like(dots, -np.inf),
                where=norms > 0,
            )

            top_indices = np.argsort(similarities)[-top_k:][::-1]
            return [docs[i] for i in top_indices]

        except Exception as e:
            logger.error("Error finding relevant chunks | error=%s", e)
            raise

    async def verify_fact(
        self, query: str, relevant_docs: List[Document]
    ) -> Dict[str, Any]:
        """Verify fact using OpenAI's API with context from relevant documents.

        Args:
            query: Statement to verify.
            relevant_docs: Chunks whose text is supplied as context.

        Returns:
            A dict with ``"verification_result"`` (the model's decoded JSON
            reply), ``"sources"`` (distinct ``source`` metadata values in
            first-seen order) and ``"token_usage"``.
        """
        try:
            context = "\n\n".join(doc.page_content for doc in relevant_docs)

            system_prompt = """You are a professional fact-checking assistant. Analyze the provided context
            and determine if the given statement is true, false, or if there isn't enough information.

            Provide your response in the following JSON format:
            {
                "verdict": "True/False/Insufficient Information",
                "confidence": "High/Medium/Low",
                "evidence": "Direct quotes or evidence from the context",
                "reasoning": "Your detailed analysis and reasoning",
                "missing_info": "Any important missing information (if applicable)"
            }"""

            user_prompt = f"""Context:
            {context}

            Statement to verify: "{query}"

            Analyze the statement based on the provided context and return your response in the specified JSON format."""

            response = await self.openai_client.generate_text_response(
                system_prompt=system_prompt, user_prompt=user_prompt, max_tokens=800
            )

            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the reported sources are deterministic across runs.
            sources = list(
                dict.fromkeys(
                    doc.metadata.get("source", "Unknown source")
                    for doc in relevant_docs
                )
            )

            return {
                "verification_result": response["response"],
                "sources": sources,
                "token_usage": {
                    "prompt_tokens": response["prompt_tokens"],
                    "completion_tokens": response["completion_tokens"],
                    "total_tokens": response["total_tokens"],
                },
            }

        except Exception as e:
            logger.error("Error verifying fact | error=%s", e)
            raise

    async def check_fact(self, url: str, query: str) -> Dict[str, Any]:
        """Main method to check a fact against a webpage.

        Args:
            url: Page to use as evidence.
            query: Statement to verify.

        Returns:
            The dict produced by ``verify_fact``.

        Raises:
            ValueError: if the page yields no text chunks.
            Exception: any error from scraping, embedding or verification is
                logged and re-raised.
        """
        try:
            docs = await self.scrape_webpage(url)
            if not docs:
                raise ValueError(f"No text content found at URL: {url}")

            doc_texts = [doc.page_content for doc in docs]

            # get_embeddings is synchronous; run it in the default executor
            # so the event loop stays responsive during the API calls.
            loop = asyncio.get_running_loop()
            doc_embeddings = await loop.run_in_executor(
                None, self.openai_client.get_embeddings, doc_texts
            )
            query_embedding = await loop.run_in_executor(
                None, self.openai_client.get_embeddings, [query]
            )

            relevant_docs = self.find_relevant_chunks(
                query_embedding[0], doc_embeddings, docs
            )
            return await self.verify_fact(query, relevant_docs)

        except Exception as e:
            logger.error("Error checking fact | error=%s", e)
            raise