added Perplexity for fact checking
parent afe5c1d576
commit f7b2d77ce4
9 changed files with 225 additions and 44 deletions
@@ -1,9 +1,12 @@
 from fastapi import APIRouter, HTTPException
-import httpx
 import asyncio
 import logging
+import httpx
+import json
+import re
 from typing import Union, Optional, Dict, Any
-from app.config import GOOGLE_API_KEY, GOOGLE_FACT_CHECK_BASE_URL, OPENAI_API_KEY
+from datetime import datetime
+from app.config import OPENAI_API_KEY, PERPLEXITY_API_KEY
 from app.api.scrap_websites import search_websites, SearchRequest
 from app.services.openai_client import OpenAIClient, AIFactChecker
 from app.services.image_text_extractor import ImageTextExtractor
@@ -12,12 +15,10 @@ from app.models.fact_check_models import (
     FactCheckRequest,
     FactCheckResponse,
     UnverifiedFactCheckResponse,
     ErrorResponse,
     Source,
     VerdictEnum,
     ConfidenceEnum
 )
-from app.websites.fact_checker_website import get_all_sources
-
 # Setup logging
 logger = logging.getLogger(__name__)
@@ -42,10 +43,11 @@ async def process_url_content(url: str) -> Optional[str]:
         logger.error(f"Error extracting text from URL: {str(e)}")
         return None
 
+# Assuming the enums and models like FactCheckResponse, VerdictEnum, etc., are already imported
+
 async def process_fact_check(query: str) -> Union[FactCheckResponse, UnverifiedFactCheckResponse]:
     """Process a single fact check query."""
-    if not GOOGLE_API_KEY or not GOOGLE_FACT_CHECK_BASE_URL:
+    if not PERPLEXITY_API_KEY:
+        logger.error("Perplexity API key not configured")
         return UnverifiedFactCheckResponse(
             claim=query,
             verdict=VerdictEnum.UNVERIFIED,
@@ -56,51 +58,229 @@ async def process_fact_check(query: str) -> Union[FactCheckResponse, UnverifiedFactCheckResponse]:
             additional_context="This is a temporary system configuration issue."
         )
 
-    headers = {"Content-Type": "application/json"}
-    async with httpx.AsyncClient() as client:
-        fact_checker_sources = get_all_sources()
+    url = "https://api.perplexity.ai/chat/completions"
+    headers = {
+        "accept": "application/json",
+        "content-type": "application/json",
+        "Authorization": f"Bearer {PERPLEXITY_API_KEY}"
+    }
 
-        for source in fact_checker_sources:
-            params = {
-                "key": GOOGLE_API_KEY,
-                "query": query,
-                "languageCode": "en-US",
-                "reviewPublisherSiteFilter": source.domain,
-                "pageSize": 10,
-            }
+    payload = {
+        "model": "sonar",
+        "messages": [
+            {
+                "role": "system",
+                "content": (
+                    "You are a precise fact checker. Analyze the following claim and determine if it's true, false, or partially true. "
+                    "Provide a clear verdict, confidence level (HIGH, MEDIUM, LOW), and cite reliable sources. "
+                    "Format your response as JSON with fields: verdict, confidence, sources (array of URLs), "
+                    "evidence (key facts as a string), and explanation (detailed reasoning as a string)."
+                )
+            },
+            {
+                "role": "user",
+                "content": f"Fact check this claim: {query}"
+            }
+        ]
+    }
 
+    try:
+        async with httpx.AsyncClient(timeout=30) as client:
+            response = await client.post(url, headers=headers, json=payload)
+            response.raise_for_status()
+            result = response.json()
+            perplexity_response = result["choices"][0]["message"]["content"]
+
+            # Attempt to extract JSON
+            try:
+                parsed_data = json.loads(perplexity_response)
+            except json.JSONDecodeError:
+                match = re.search(r'\{.*\}', perplexity_response, re.DOTALL)
+                if match:
+                    parsed_data = json.loads(match.group(0))
+                else:
+                    parsed_data = extract_fact_check_info(perplexity_response)
+
+            verdict_mapping = {
+                "true": VerdictEnum.TRUE,
+                "false": VerdictEnum.FALSE,
+                "partially true": VerdictEnum.PARTIALLY_TRUE,
+                "partially false": VerdictEnum.PARTIALLY_TRUE,
+                "unverified": VerdictEnum.UNVERIFIED
+            }
 
-        try:
-            response = await client.get(
-                GOOGLE_FACT_CHECK_BASE_URL, params=params, headers=headers
-            )
-            response.raise_for_status()
-            json_response = response.json()
+            confidence_mapping = {
+                "high": ConfidenceEnum.HIGH,
+                "medium": ConfidenceEnum.MEDIUM,
+                "low": ConfidenceEnum.LOW
+            }
+
+            raw_verdict = parsed_data.get("verdict", "").lower()
+            verdict = verdict_mapping.get(raw_verdict, VerdictEnum.UNVERIFIED)
+
+            raw_confidence = parsed_data.get("confidence", "").lower()
+            confidence = confidence_mapping.get(raw_confidence, ConfidenceEnum.MEDIUM)
+
+            sources = [
+                Source(
+                    url=url,
+                    domain=extract_domain(url),
+                    title=f"Source from {extract_domain(url)}",
+                    publisher=extract_domain(url),
+                    date_published=None,
+                    snippet="Source cited by Perplexity AI"
+                )
+                for url in parsed_data.get("sources", [])
+            ]
 
-            if json_response.get("claims"):
-                return await generate_fact_report(query, json_response)
+            # Convert evidence to string if it's not already
+            evidence = parsed_data.get("evidence", "")
+            if isinstance(evidence, dict):
+                # Convert dictionary evidence to string format
+                evidence_str = ""
+                for key, value in evidence.items():
+                    evidence_str += f"{key}: {value}\n"
+                evidence = evidence_str.strip()
 
-        except Exception as e:
-            logger.error(f"Error with source {source.domain}: {str(e)}")
-            continue
+            # Convert explanation to string if it's not already
+            explanation = parsed_data.get("explanation", "")
+            if isinstance(explanation, dict):
+                explanation_str = ""
+                for key, value in explanation.items():
+                    explanation_str += f"{key}: {value}\n"
+                explanation = explanation_str.strip()
 
-        try:
-            search_request = SearchRequest(
-                search_text=query,
-                source_types=["fact_checkers"]
-            )
+            return FactCheckResponse(
+                claim=query,
+                verdict=verdict,
+                confidence=confidence,
+                sources=sources,
+                evidence=evidence,
+                explanation=explanation,
+                additional_context=f"Fact checked using PlanPost AI on {datetime.now().strftime('%Y-%m-%d')}"
+            )
 
-            ai_response = await search_websites(search_request)
-            return await generate_fact_report(query, ai_response)
+    except Exception as e:
+        logger.error(f"Fact check error: {str(e)}")
+        return UnverifiedFactCheckResponse(
+            claim=query,
+            verdict=VerdictEnum.UNVERIFIED,
+            confidence=ConfidenceEnum.LOW,
+            sources=[],
+            evidence=str(e),
+            explanation="Failed to contact Perplexity AI or parse its response.",
+            additional_context="Possible API issue or malformed response."
+        )
 
-        except Exception as e:
-            logger.error(f"Error in AI fact check: {str(e)}")
-            return await generate_fact_report(query, {
-                "status": "no_results",
-                "verification_result": {
-                    "no_sources_found": True,
-                    "reason": str(e)
-                }
-            })
 
+def extract_domain(url: str) -> str:
+    """Extract domain from URL.
+
+    Args:
+        url: The URL to extract domain from
+
+    Returns:
+        The domain name or "unknown" if parsing fails
+    """
+    try:
+        from urllib.parse import urlparse
+        parsed_url = urlparse(url)
+        domain = parsed_url.netloc
+        return domain if domain else "unknown"
+    except Exception as e:
+        logger.warning(f"Failed to extract domain from URL {url}: {str(e)}")
+        return "unknown"
+
+
+def extract_fact_check_info(text_response: str) -> Dict[str, Any]:
+    """Extract fact-checking information from a text response when JSON parsing fails.
+
+    Args:
+        text_response: The text response from Perplexity AI
+
+    Returns:
+        A dictionary with fact-checking information extracted from the text
+    """
+    result = {
+        "verdict": "unverified",
+        "confidence": "medium",
+        "sources": [],
+        "evidence": "",
+        "explanation": ""
+    }
+
+    # Try to extract verdict with more comprehensive pattern matching
+    verdict_patterns = [
+        r'verdict[:\s]+(true|false|partially true|partially false|inconclusive|unverified)',
+        r'(true|false|partially true|partially false|inconclusive|unverified)[:\s]+verdict',
+        r'claim is (true|false|partially true|partially false|inconclusive|unverified)',
+        r'statement is (true|false|partially true|partially false|inconclusive|unverified)'
+    ]
+
+    for pattern in verdict_patterns:
+        verdict_match = re.search(pattern, text_response.lower(), re.IGNORECASE)
+        if verdict_match:
+            result["verdict"] = verdict_match.group(1)
+            break
+
+    # Try to extract confidence with multiple patterns
+    confidence_patterns = [
+        r'confidence[:\s]+(high|medium|low)',
+        r'(high|medium|low)[:\s]+confidence',
+        r'confidence level[:\s]+(high|medium|low)',
+        r'(high|medium|low)[:\s]+confidence level'
+    ]
+
+    for pattern in confidence_patterns:
+        confidence_match = re.search(pattern, text_response.lower(), re.IGNORECASE)
+        if confidence_match:
+            result["confidence"] = confidence_match.group(1)
+            break
+
+    # Try to extract URLs as sources - more robust pattern
+    urls = re.findall(r'https?://[^\s"\'\]\)]+', text_response)
+    # Filter out any malformed URLs
+    valid_urls = []
+    for url in urls:
+        if '.' in url and len(url) > 10:  # Basic validation
+            valid_urls.append(url)
+    result["sources"] = valid_urls
+
+    # Try to extract evidence and explanation with multiple patterns
+    evidence_patterns = [
+        r'evidence[:\s]+(.*?)(?=explanation|\Z)',
+        r'key facts[:\s]+(.*?)(?=explanation|\Z)',
+        r'facts[:\s]+(.*?)(?=explanation|\Z)'
+    ]
+
+    for pattern in evidence_patterns:
+        evidence_match = re.search(pattern, text_response, re.IGNORECASE | re.DOTALL)
+        if evidence_match:
+            result["evidence"] = evidence_match.group(1).strip()
+            break
+
+    explanation_patterns = [
+        r'explanation[:\s]+(.*?)(?=\Z)',
+        r'reasoning[:\s]+(.*?)(?=\Z)',
+        r'analysis[:\s]+(.*?)(?=\Z)'
+    ]
+
+    for pattern in explanation_patterns:
+        explanation_match = re.search(pattern, text_response, re.IGNORECASE | re.DOTALL)
+        if explanation_match:
+            result["explanation"] = explanation_match.group(1).strip()
+            break
+
+    # If no structured information found, use the whole response as evidence
+    if not result["evidence"] and not result["explanation"]:
+        result["evidence"] = text_response
+        # Generate a minimal explanation if none was found
+        result["explanation"] = "The fact-checking service provided information about this claim but did not structure it in the expected format. The full response has been included as evidence for you to review."
+
+    return result
+
+
 async def generate_fact_report(query: str, fact_check_data: dict | AIFactCheckResponse) -> Union[FactCheckResponse, UnverifiedFactCheckResponse]:
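Note: a minimal sketch of exercising the new Perplexity-backed checker end to end. The import path app.api.fact_check is an assumption (the file name is not shown in this view), and the sample claim is illustrative only:

import asyncio

# Hypothetical module path -- adjust to wherever process_fact_check lives.
from app.api.fact_check import process_fact_check

async def main() -> None:
    # Both FactCheckResponse and UnverifiedFactCheckResponse expose these fields.
    result = await process_fact_check("The Eiffel Tower is located in Berlin.")
    print(result.verdict, result.confidence)
    for source in result.sources:
        print(source.url)

if __name__ == "__main__":
    asyncio.run(main())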
@@ -7,6 +7,7 @@ GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]
 GOOGLE_FACT_CHECK_BASE_URL = os.environ["GOOGLE_FACT_CHECK_BASE_URL"]
 GOOGLE_ENGINE_ID = os.environ["GOOGLE_ENGINE_ID"]
 GOOGLE_SEARCH_URL = os.environ["GOOGLE_SEARCH_URL"]
+PERPLEXITY_API_KEY = os.environ["PERPLEXITY_API_KEY"]
 
 OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
 FRONTEND_URL = os.environ["FRONTEND_URL"]
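Side note on the config line above: indexing os.environ directly raises a bare KeyError at import time when the variable is missing. A sketch of a friendlier failure mode (an alternative, not what this commit does):

import os

# Sketch only: fail with an actionable message instead of a bare KeyError.
PERPLEXITY_API_KEY = os.environ.get("PERPLEXITY_API_KEY")
if not PERPLEXITY_API_KEY:
    raise RuntimeError(
        "PERPLEXITY_API_KEY is not set; export it or add it to your environment before starting the app"
    )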
@@ -43,7 +43,7 @@ mdurl==0.1.2
 multidict==6.1.0
 mypy-extensions==1.0.0
 numpy==1.26.4
-openai==0.28.0
+openai==1.23.6
 orjson==3.10.12
 packaging==24.2
 pathspec==0.12.1
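The openai pin above jumps from 0.28.0 to 1.23.6, which crosses the 1.0 rewrite: the module-level openai.ChatCompletion.create API was removed in favor of a client object. Any 0.28-style call sites (for example in app/services/openai_client.py, not shown in this diff) would need the new form; a minimal sketch, with an illustrative model name:

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

completion = client.chat.completions.create(
    model="gpt-4o-mini",  # illustrative; use whatever model the service expects
    messages=[{"role": "user", "content": "Say hello"}],
)
print(completion.choices[0].message.content)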