From 790d58402a05801b292d06d96f5f537ac3866ac0 Mon Sep 17 00:00:00 2001
From: Utsho Dey
Date: Sun, 15 Dec 2024 18:22:04 +0600
Subject: [PATCH] Add Google Custom Search and improve fact-check error handling

---
 app/__pycache__/config.cpython-312.pyc        | Bin 507 -> 576 bytes
 .../__pycache__/fact_check.cpython-312.pyc    | Bin 5851 -> 6289 bytes
 app/api/fact_check.py                         | 124 ++--
 app/api/scrap_websites.py                     | 533 ++++++++++--------
 app/config.py                                 |   1 +
 .../fact_check_models.cpython-312.pyc         | Bin 9422 -> 9422 bytes
 app/models/scrap_websites_models.py           |  43 ++
 7 files changed, 390 insertions(+), 311 deletions(-)
 create mode 100644 app/models/scrap_websites_models.py

diff --git a/app/__pycache__/config.cpython-312.pyc b/app/__pycache__/config.cpython-312.pyc
index 91b068866373f8d9210744c00633448a6572698b..22a68c5e11d594851135abcde42a1e5f1ce57cb2 100644
Binary files a/app/__pycache__/config.cpython-312.pyc and b/app/__pycache__/config.cpython-312.pyc differ
diff --git a/app/api/__pycache__/fact_check.cpython-312.pyc b/app/api/__pycache__/fact_check.cpython-312.pyc
index 7304a380d4674133d015ebad37508110a0b5cb53..98c2526f8272844d81c35b9bb2a51494f76a6e41 100644
Binary files a/app/api/__pycache__/fact_check.cpython-312.pyc and b/app/api/__pycache__/fact_check.cpython-312.pyc differ
diff --git a/app/api/fact_check.py b/app/api/fact_check.py
index c5f494b..432f0de 100644
--- a/app/api/fact_check.py
+++ b/app/api/fact_check.py
@@ -2,7 +2,7 @@ from fastapi import APIRouter, HTTPException
 import json
 from datetime import datetime
 from typing import Dict, List
-
+import httpx
 from app.config import GOOGLE_API_KEY, GOOGLE_FACT_CHECK_BASE_URL
 from app.models.fact_check_models import (
     GoogleFactCheckRequest as FactCheckRequest,
@@ -12,7 +12,6 @@ from app.models.fact_check_models import (
     TokenUsage
 )
 from app.websites.fact_checker_website import fetch_fact_checks, get_all_sources
-from app.api.scrap_websites import SearchRequest, search_websites
 
 fact_check_router = APIRouter()
 
@@ -22,6 +21,39 @@ class CustomJSONEncoder(json.JSONEncoder):
             return obj.isoformat()
         return super().default(obj)
 
+async def validate_api_key():
+    """Validate the Google API key with a test request"""
+    async with httpx.AsyncClient() as client:
+        try:
+            test_url = f"{GOOGLE_FACT_CHECK_BASE_URL}claims:search"
+            params = {
+                "key": GOOGLE_API_KEY,
+                "query": "test",
+                "languageCode": "en-US",
+                "pageSize": 1
+            }
+            response = await client.get(test_url, params=params)
+            response.raise_for_status()
+            return True
+        except httpx.HTTPStatusError as e:
+            if e.response.status_code == 403:
+                raise HTTPException(
+                    status_code=503,
+                    detail=ErrorResponse(
+                        detail="Invalid or expired API key",
+                        error_code="INVALID_API_KEY",
+                        path="/check-facts"
+                    ).dict()
+                )
+            raise HTTPException(
+                status_code=503,
+                detail=ErrorResponse(
+                    detail=f"API validation failed: {str(e)}",
+                    error_code="API_VALIDATION_ERROR",
+                    path="/check-facts"
+                ).dict()
+            )
+
 @fact_check_router.post(
     "/check-facts",
     response_model=FactCheckResponse,
@@ -34,7 +66,7 @@ class CustomJSONEncoder(json.JSONEncoder):
 )
 async def check_facts(request: FactCheckRequest) -> FactCheckResponse:
     """
-    Check facts using multiple fact-checking sources and fallback to web search
+    Check facts using multiple fact-checking sources
     """
     all_results = []
     verified_results = []
@@ -50,10 +82,14 @@ async def check_facts(request: FactCheckRequest) -> FactCheckResponse:
             ).dict()
         )
 
+    # Validate API key before proceeding
+    await validate_api_key()
+
     # Get all sources in priority order
     all_sources = get_all_sources()
     all_sources_list = []  # To store source URLs
     contexts_used = []  # To store context snippets
+    failed_sources = []  # Track failed sources
 
     for source in all_sources:
         try:
@@ -78,75 +114,39 @@ async def check_facts(request: FactCheckRequest) -> FactCheckResponse:
                     if "textualRating" in review:
                         contexts_used.append(review["textualRating"])
 
-        except HTTPException:
+        except HTTPException as http_err:
+            failed_sources.append({
+                "source": source.domain,
+                "error": str(http_err.detail)
+            })
             continue
         except Exception as e:
-            # Log the error but continue with other sources
-            print(f"Error processing {source.domain}: {str(e)}")
+            failed_sources.append({
+                "source": source.domain,
+                "error": 
str(e) + }) continue - # If no results found, try searching websites - if not all_results: - try: - # Create search request - search_request = SearchRequest( - search_text=request.content, - source_types=["fact_checkers"] - ) - - # Perform website search - search_response = await search_websites(search_request) - - # If AI fact check results are available, use them - if search_response.ai_fact_check_result: - # Create a claim from AI fact check result - ai_claim = { - "text": request.content, - "claimant": "AI Analysis", - "claimDate": datetime.now().isoformat(), - "claimReview": [{ - "publisher": { - "name": "AI Fact Checker", - "site": "ai-fact-check" - }, - "textualRating": search_response.ai_fact_check_result.verification_result["verdict"], - "title": "AI Fact Check Analysis", - "reviewDate": datetime.now().isoformat(), - "url": "" - }] - } - - validated_claim = Claim(**ai_claim).dict() - all_results.append(validated_claim) - - # Add sources and contexts - all_sources_list.extend(search_response.results.keys()) - if search_response.ai_fact_check_result.verification_result["evidence"]: - contexts_used.extend(search_response.ai_fact_check_result.verification_result["evidence"]) - - except Exception as e: - print(f"Error during website search: {str(e)}") - - # If still no results found after searching websites - if not all_results: + # Return partial results if some sources failed but we have data + if all_results: + verification_result = { + "verdict": "Partial Results Available" if failed_sources else "Complete Results", + "confidence": "Medium" if failed_sources else "High", + "evidence": contexts_used, + "reasoning": "Based on available fact checks", + "missing_info": f"{len(failed_sources)} sources failed" if failed_sources else None + } + else: raise HTTPException( status_code=404, detail=ErrorResponse( - detail="No fact check results found", + detail="No fact check results found. 
Failed sources: " + + ", ".join([f"{f['source']}: {f['error']}" for f in failed_sources]), error_code="NO_RESULTS_FOUND", path="/check-facts" ).dict() ) - # Prepare the verification result - verification_result = { - "verdict": "Insufficient Information", # Default verdict - "confidence": "Low", - "evidence": contexts_used, - "reasoning": "Based on available fact checks and web search results", - "missing_info": "Additional verification may be needed" - } - # Create token usage information token_usage = TokenUsage( prompt_tokens=0, @@ -161,10 +161,12 @@ async def check_facts(request: FactCheckRequest) -> FactCheckResponse: results=all_results, verification_result=verification_result, sources=list(set(all_sources_list)), + context_used=contexts_used, token_usage=token_usage, summary={ "total_sources": len(set(all_sources_list)), - "fact_checking_sites_queried": len(all_sources) + "fact_checking_sites_queried": len(all_sources), + "failed_sources": failed_sources } ) diff --git a/app/api/scrap_websites.py b/app/api/scrap_websites.py index 93fb31a..0dd584c 100644 --- a/app/api/scrap_websites.py +++ b/app/api/scrap_websites.py @@ -1,309 +1,342 @@ from fastapi import APIRouter, HTTPException -from pydantic import BaseModel -from typing import List, Dict, Optional -from urllib.parse import urlencode, urlparse -import urllib.parse -import numpy as np -from time import sleep +import httpx import logging -import requests -from bs4 import BeautifulSoup -import re +from urllib.parse import urlparse +import json from app.services.openai_client import OpenAIClient -from app.config import OPENAI_API_KEY +from app.config import OPENAI_API_KEY, GOOGLE_API_KEY, GOOGLE_ENGINE_ID from app.websites.fact_checker_website import SOURCES, get_all_sources from app.api.ai_fact_check import ai_fact_check +from typing import List, Dict, Optional +from pydantic import BaseModel from app.models.fact_check_models import ( - AIFactCheckRequest, - AIFactCheckResponse, - VerificationResult, - TokenUsage + AIFactCheckRequest, + FactCheckSource, + SourceType ) +# Define Pydantic models +class Publisher(BaseModel): + name: str + site: str + +class ClaimReview(BaseModel): + publisher: Publisher + textualRating: str + +class Claim(BaseModel): + claimReview: List[ClaimReview] + claimant: str + text: str + +class Summary(BaseModel): + fact_checking_sites_queried: int + total_sources: int + +class VerificationResult(BaseModel): + verdict: str + confidence: str + evidence: List[str] + reasoning: str + fact_check_type: str + +class SearchRequest(BaseModel): + search_text: str + source_types: List[str] + +class EnhancedFactCheckResponse(BaseModel): + query: str + results: List[Dict] + sources: List + summary: Summary + token_usage: Dict[str, int] + total_claims_found: int + verification_result: VerificationResult + # Configure logging logging.basicConfig( - level=logging.INFO, + level=logging.INFO, # Changed back to INFO from DEBUG format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) scrap_websites_router = APIRouter() -# Configuration for scraping -MAX_RETRIES = 2 -RETRY_DELAY = 2 +# Constants +RESULTS_PER_PAGE = 10 +MAX_PAGES = 5 +MAX_URLS_PER_DOMAIN = 5 +GOOGLE_SEARCH_URL = "https://www.googleapis.com/customsearch/v1" -class SearchRequest(BaseModel): - search_text: str - source_types: List[str] = ["fact_checkers"] - -class UrlSimilarityInfo(BaseModel): - url: str - similarity: float - extracted_text: str - -class SearchResponse(BaseModel): - results: Dict[str, List[str]] - 
error_messages: Dict[str, str] - ai_fact_check_result: Optional[Dict] = None - -def extract_url_text(url: str) -> str: - """Extract and process meaningful text from URL path with improved cleaning""" - logger.debug(f"Extracting text from URL: {url}") +def get_domain_from_url(url: str) -> str: + """Extract domain from URL with improved handling.""" try: - parsed = urllib.parse.urlparse(url) - path = parsed.path - path = path.replace('.html', '').replace('/index', '').replace('.php', '') - segments = [seg for seg in path.split('/') if seg] - cleaned_segments = [] - for segment in segments: - segment = segment.replace('-', ' ').replace('_', ' ') - if not (segment.replace(' ', '').isdigit() or - all(part.isdigit() for part in segment.split() if part)): - cleaned_segments.append(segment) - - common_words = { - 'www', 'live', 'news', 'intl', 'index', 'world', 'us', 'uk', - 'updates', 'update', 'latest', 'breaking', 'new', 'article' - } - - text = ' '.join(cleaned_segments) - words = [word.lower() for word in text.split() - if word.lower() not in common_words and len(word) > 1] - - result = ' '.join(words) - logger.debug(f"Extracted text: {result}") - return result + parsed = urlparse(url) + domain = parsed.netloc.lower() + # Remove 'www.' if present + if domain.startswith('www.'): + domain = domain[4:] + return domain except Exception as e: - logger.error(f"Error extracting text from URL {url}: {str(e)}") - return '' + logger.error(f"Error extracting domain from URL {url}: {str(e)}") + return "" -def extract_search_results(html_content): - """Extract URLs using multiple selectors and patterns""" - soup = BeautifulSoup(html_content, 'html.parser') - urls = set() # Using set to avoid duplicates +def is_valid_source_domain(domain: str, sources: List[FactCheckSource]) -> bool: + """Check if domain matches any source with improved matching logic.""" + if not domain: + return False - # Multiple CSS selectors to try - selectors = [ - 'div.g div.yuRUbf > a', # Main result links - 'div.g a.l', # Alternative link format - 'div.rc a', # Another possible format - 'div[class*="g"] > div > div > div > a', # Broader match - 'a[href^="http"]' # Any http link - ] + domain = domain.lower() + if domain.startswith('www.'): + domain = domain[4:] - for selector in selectors: - try: - elements = soup.select(selector) - for element in elements: - url = element.get('href') - if url and url.startswith('http') and not url.startswith('https://www.google.com'): - urls.add(url) - except Exception as e: - logger.debug(f"Error with selector {selector}: {str(e)}") + for source in sources: + source_domain = source.domain.lower() + if source_domain.startswith('www.'): + source_domain = source_domain[4:] + + # Check exact match + if domain == source_domain: + logger.debug(f"Exact domain match found: {domain} = {source_domain}") + return True + + # Check if domain ends with source domain + if domain.endswith('.' 
+ source_domain): + logger.debug(f"Subdomain match found: {domain} ends with {source_domain}") + return True - # Also try finding URLs in the raw HTML using regex - url_pattern = r'href="(https?://[^"]+)"' - raw_urls = re.findall(url_pattern, html_content) - for url in raw_urls: - if not url.startswith('https://www.google.com'): - urls.add(url) - - return list(urls) + logger.debug(f"No match found for domain: {domain}") + return False -def google_search_scraper(search_text: str, site_domain: str, retry_count: int = 0) -> List[str]: - """Scrape Google search results with multiple query formats""" - logger.info(f"Searching for '{search_text}' on domain: {site_domain}") +async def build_enhanced_search_query(query: str, sources: List[FactCheckSource]) -> str: + """Build search query with site restrictions.""" + site_queries = [f"site:{source.domain}" for source in sources] + site_restriction = " OR ".join(site_queries) + enhanced_query = f"({query}) ({site_restriction})" + logger.debug(f"Enhanced search query: {enhanced_query}") + return enhanced_query + +async def google_custom_search(query: str, sources: List[FactCheckSource], page: int = 1) -> Optional[Dict]: + """Perform Google Custom Search with enhanced query.""" + enhanced_query = await build_enhanced_search_query(query, sources) + start_index = ((page - 1) * RESULTS_PER_PAGE) + 1 - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.5', - 'Referer': 'https://www.google.com/', - 'DNT': '1' + params = { + "key": GOOGLE_API_KEY, + "cx": GOOGLE_ENGINE_ID, + "q": enhanced_query, + "num": RESULTS_PER_PAGE, + "start": start_index } - # Try different query formats - query_formats = [ - f"{search_text} site:{site_domain}", - f"site:{site_domain} {search_text}", - f"\"{search_text}\" site:{site_domain}" - ] - - all_urls = set() - - for query in query_formats: + async with httpx.AsyncClient(timeout=30.0) as client: try: - google_url = f"https://www.google.com/search?q={urlencode({'q': query})}" - logger.debug(f"Trying query format: {query}") + logger.info(f"Making API request to Google Custom Search with params: {params}") + response = await client.get(GOOGLE_SEARCH_URL, params=params) + response.raise_for_status() - response = requests.get(google_url, headers=headers) + data = response.json() - if response.status_code == 200: - urls = extract_search_results(response.text) - domain_urls = [url for url in urls if site_domain in urlparse(url).netloc] - all_urls.update(domain_urls) - else: - logger.warning(f"Received status code {response.status_code} for query format: {query}") + search_info = data.get('searchInformation', {}) + logger.info(f"Search info: Total results: {search_info.get('totalResults', 0)}, " + f"Time taken: {search_info.get('searchTime', 0)}s") - sleep(2) # Delay between requests + if 'error' in data: + error_details = data['error'] + logger.error(f"API Error: {error_details}") + raise HTTPException( + status_code=response.status_code, + detail=f"Google API Error: {error_details.get('message')}" + ) + + return data except Exception as e: - logger.error(f"Error processing query format '{query}': {str(e)}") - if retry_count < MAX_RETRIES: - sleep(RETRY_DELAY) - return google_search_scraper(search_text, site_domain, retry_count + 1) + logger.error(f"Search error: {str(e)}", exc_info=True) + raise HTTPException(status_code=500, 
detail=f"Search error: {str(e)}") + +async def analyze_fact_check_results(openai_client: OpenAIClient, original_response: Dict) -> Dict: + """Analyze fact check results using OpenAI to generate a consolidated verdict.""" - valid_urls = list(all_urls) - logger.info(f"Found {len(valid_urls)} unique URLs for domain: {site_domain}") - return valid_urls[:5] # Return up to 5 URLs - -def calculate_similarity(query_embedding: List[float], url_embedding: List[float]) -> float: - """Calculate cosine similarity between two embeddings""" - query_array = np.array(query_embedding) - url_array = np.array(url_embedding) + # Extract verification results from sources + verification_results = [] + for url, result in original_response.get('verification_result', {}).items(): + verification_results.append(f""" + Source: {url} + Verdict: {result.get('verdict')} + Confidence: {result.get('confidence')} + Evidence: {result.get('evidence')} + Reasoning: {result.get('reasoning')} + """) - similarity = np.dot(url_array, query_array) / ( - np.linalg.norm(url_array) * np.linalg.norm(query_array) - ) - return float(similarity) + system_prompt = """You are a professional fact-checking analyzer. Your task is to analyze multiple fact-checking results + and provide a consolidated verdict. Respond with a valid JSON object containing your analysis.""" + + user_prompt = f""" + Analyze these fact-checking results and provide a final verdict. + + Query: {original_response.get('query', '')} + + Fact Check Results: + {'\n'.join(verification_results)}""" + try: + logger.info("Generating AI analysis of fact check results") + response = await openai_client.generate_text_response( + system_prompt=system_prompt, + user_prompt=user_prompt, + max_tokens=2000 + ) + + # Create the enhanced result structure + enhanced_result = { + "query": original_response.get('query', ''), + "results": [ + { + "claimReview": [ + { + "publisher": { + "name": source, + "site": source + }, + "textualRating": result.get('verdict', '') + } for source in original_response.get('sources', []) + ], + "claimant": "source", + "text": original_response.get('query', '') + } + ], + "sources": original_response.get('sources', []), + "summary": { + "fact_checking_sites_queried": len(original_response.get('sources', [])), + "total_sources": len(original_response.get('verification_result', {})) + }, + "verification_result": { + "verdict": next(iter(original_response.get('verification_result', {}).values()), {}).get('verdict', ''), + "confidence": next(iter(original_response.get('verification_result', {}).values()), {}).get('confidence', ''), + "evidence": [next(iter(original_response.get('verification_result', {}).values()), {}).get('evidence', '')], + "reasoning": next(iter(original_response.get('verification_result', {}).values()), {}).get('reasoning', ''), + "fact_check_type": "ai fact checker" + }, + "token_usage": original_response.get('token_usage', { + "prompt_tokens": 0, + "completion_tokens": 0, + "total_tokens": 0 + }) + } + + enhanced_result["total_claims_found"] = len(enhanced_result.get("results", [])) + + logger.info("Successfully generated AI analysis") + return enhanced_result -@scrap_websites_router.post("/search", response_model=SearchResponse) + except Exception as e: + logger.error(f"Error in OpenAI analysis: {str(e)}") + raise HTTPException(status_code=500, detail=f"Error in fact check analysis: {str(e)}") +@scrap_websites_router.post("/search", response_model=EnhancedFactCheckResponse) async def search_websites(request: SearchRequest): 
logger.info(f"Starting search with query: {request.search_text}") logger.info(f"Source types requested: {request.source_types}") - results = {} - error_messages = {} - - # Initialize OpenAI client - logger.debug("Initializing OpenAI client") - openai_client = OpenAIClient(OPENAI_API_KEY) - - # Get domains based on requested source types - domains = [] + # Get sources for requested types + selected_sources = [] for source_type in request.source_types: if source_type in SOURCES: - domains.extend([source.domain for source in SOURCES[source_type]]) + selected_sources.extend(SOURCES[source_type]) - if not domains: - logger.warning("No valid source types provided. Using all available domains.") - domains = [source.domain for source in get_all_sources()] + if not selected_sources: + logger.warning("No valid source types provided. Using all available sources.") + selected_sources = get_all_sources() - logger.info(f"Processing {len(domains)} domains") + logger.info(f"Selected sources: {[source.domain for source in selected_sources]}") - # Enhance search text with key terms - search_context = request.search_text - logger.debug("Getting query embedding from OpenAI") - query_embedding = openai_client.get_embeddings([search_context])[0] + # Initialize collections for URLs + all_urls = [] + domain_results = {} - # Higher similarity threshold for better filtering - SIMILARITY_THRESHOLD = 0.75 - MAX_URLS_PER_DOMAIN = 2 # Adjusted to ensure total stays under 5 - TOTAL_MAX_URLS = 5 # Maximum URLs allowed for AIFactCheckRequest - - total_urls_collected = 0 - for domain in domains[:3]: # Limit to 3 domains for testing - if total_urls_collected >= TOTAL_MAX_URLS: - break + try: + # Search and collect URLs + for page in range(1, MAX_PAGES + 1): + if len(all_urls) >= 50: + logger.info("Reached maximum URL limit of 50") + break - logger.info(f"Processing domain: {domain}") - try: - urls = google_search_scraper(request.search_text, domain) - valid_urls = [] + logger.info(f"Fetching page {page} of search results") + search_response = await google_custom_search(request.search_text, selected_sources, page) - logger.debug(f"Found {len(urls)} URLs for domain {domain}") + if not search_response or not search_response.get("items"): + logger.warning(f"No results found on page {page}") + break - for url in urls: - if len(valid_urls) >= MAX_URLS_PER_DOMAIN or total_urls_collected >= TOTAL_MAX_URLS: - break - - url_text = extract_url_text(url) - - if not url_text: - logger.debug(f"No meaningful text extracted from URL: {url}") + for item in search_response.get("items", []): + url = item.get("link") + if not url: continue - logger.debug("Getting URL embedding from OpenAI") - url_embedding = openai_client.get_embeddings([url_text])[0] - similarity = calculate_similarity(query_embedding, url_embedding) + domain = get_domain_from_url(url) + logger.debug(f"Processing URL: {url} with domain: {domain}") - logger.debug(f"Similarity score for {url}: {similarity}") + if is_valid_source_domain(domain, selected_sources): + if domain not in domain_results: + domain_results[domain] = [] + + if len(domain_results[domain]) < MAX_URLS_PER_DOMAIN: + domain_results[domain].append({ + "url": url, + "title": item.get("title", ""), + "snippet": item.get("snippet", "") + }) + all_urls.append(url) + else: + logger.debug(f"Skipping URL {url} - domain not in allowed list") - if similarity >= SIMILARITY_THRESHOLD: - valid_urls.append(url) - total_urls_collected += 1 - - results[domain] = valid_urls - logger.info(f"Successfully processed domain 
{domain}. Found {len(valid_urls)} valid URLs") - - except HTTPException as e: - logger.error(f"HTTP Exception for domain {domain}: {str(e.detail)}") - error_messages[domain] = str(e.detail) - except Exception as e: - logger.error(f"Unexpected error for domain {domain}: {str(e)}") - error_messages[domain] = f"Unexpected error for {domain}: {str(e)}" + if len(all_urls) >= 50: + break - sleep(1) # Add delay between processing different domains - - logger.info("Search completed") - logger.debug(f"Results found for {len(results)} domains") - logger.debug(f"Errors encountered for {len(error_messages)} domains") - - # Collect all valid URLs from results - all_valid_urls = [] - for domain_urls in results.values(): - all_valid_urls.extend(domain_urls) - - logger.info(f"Total valid URLs collected: {len(all_valid_urls)}") - - # Create request body for AI fact check - if all_valid_urls: + logger.info(f"Total URLs collected: {len(all_urls)}") + + if not all_urls: + return EnhancedFactCheckResponse( + query=request.search_text, + results=[], + sources=[], + summary=Summary( + fact_checking_sites_queried=len(selected_sources), + total_sources=0 + ), + token_usage={ + "prompt_tokens": 0, + "completion_tokens": 0, + "total_tokens": 0 + }, + total_claims_found=0, + verification_result=VerificationResult( + verdict="Insufficient Evidence", + confidence="Low", + evidence=["No relevant sources found"], + reasoning="No fact-checking sources were found for this claim", + fact_check_type="ai fact checker" + ) + ) + + # Perform fact check with collected URLs fact_check_request = AIFactCheckRequest( content=request.search_text, - urls=all_valid_urls[:TOTAL_MAX_URLS] # Ensure we don't exceed the limit + urls=all_urls[:5] # Limit to 5 URLs ) - logger.info("Calling AI fact check service") - try: - ai_response = await ai_fact_check(fact_check_request) - logger.info("AI fact check completed successfully") - - # Format AI fact check response - formatted_response = { - "query": ai_response.query, - "token_usage": { - "prompt_tokens": ai_response.token_usage.prompt_tokens, - "completion_tokens": ai_response.token_usage.completion_tokens, - "total_tokens": ai_response.token_usage.total_tokens - }, - "sources": ai_response.sources, - "verification_result": { - url: { - "verdict": result.verdict, - "confidence": result.confidence, - "evidence": result.evidence, - "reasoning": result.reasoning, - "missing_info": result.missing_info - } for url, result in ai_response.verification_result.items() - } - } - - # Return response with AI fact check results - return SearchResponse( - results=results, - error_messages=error_messages, - ai_fact_check_result=formatted_response - ) - - except Exception as e: - logger.error(f"Error during AI fact check: {str(e)}") - error_messages["ai_fact_check"] = f"Error during fact checking: {str(e)}" - - # Return response without AI fact check if no valid URLs or error occurred - return SearchResponse( - results=results, - error_messages=error_messages, - ai_fact_check_result=None - ) \ No newline at end of file + logger.info(f"Performing fact check with {len(fact_check_request.urls)} URLs") + fact_check_response = await ai_fact_check(fact_check_request) + + # Get enhanced analysis + openai_client = OpenAIClient(OPENAI_API_KEY) + enhanced_response = await analyze_fact_check_results( + openai_client, + fact_check_response.dict() + ) + + return EnhancedFactCheckResponse(**enhanced_response) + + except Exception as e: + logger.error(f"Error during search/fact-check process: {str(e)}", exc_info=True) + 
raise HTTPException(status_code=500, detail=str(e))
\ No newline at end of file
diff --git a/app/config.py b/app/config.py
index a13fd4d..8b60dd0 100644
--- a/app/config.py
+++ b/app/config.py
@@ -5,6 +5,7 @@ load_dotenv()
 
 GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]
 GOOGLE_FACT_CHECK_BASE_URL= os.environ["GOOGLE_FACT_CHECK_BASE_URL"]
+GOOGLE_ENGINE_ID = os.environ["GOOGLE_ENGINE_ID"]
 OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
 
 FRONTEND_URL = os.environ["FRONTEND_URL"]
\ No newline at end of file
diff --git a/app/models/__pycache__/fact_check_models.cpython-312.pyc b/app/models/__pycache__/fact_check_models.cpython-312.pyc
index 6df8e42d278ee3c78aa5452c872101d3e81faf2c..694cd33e5de5f8ed1537243806a1ba949ad2ec85 100644
Binary files a/app/models/__pycache__/fact_check_models.cpython-312.pyc and b/app/models/__pycache__/fact_check_models.cpython-312.pyc differ
diff --git a/app/models/scrap_websites_models.py b/app/models/scrap_websites_models.py
new file mode 100644
index 0000000..1c629c5
--- /dev/null
+++ b/app/models/scrap_websites_models.py
@@ -0,0 +1,43 @@
+from pydantic import BaseModel
+from typing import List, Dict
+
+class SearchRequest(BaseModel):
+    search_text: str
+    source_types: List[str] = ["fact_checkers"]
+
+class Publisher(BaseModel):
+    name: str
+    site: str
+
+class ClaimReview(BaseModel):
+    publisher: Publisher
+    textualRating: str
+
+class Claim(BaseModel):
+    claimReview: List[ClaimReview]
+    claimant: str
+    text: str
+
+class Summary(BaseModel):
+    fact_checking_sites_queried: int
+    total_sources: int
+
+class TokenUsage(BaseModel):
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+
+class VerificationResult(BaseModel):
+    verdict: str
+    confidence: str
+    evidence: List[str]
+    reasoning: str
+
+class EnhancedFactCheckResponse(BaseModel):
+    query: str
+    results: List[Claim]
+    sources: List[str]
+    summary: Summary
+    token_usage: Dict[str, int]
+    total_claims_found: int
+    verification_result: VerificationResult
\ No newline at end of file