From 9298352f2e19b9cb1f17c11604040e26df3969a9 Mon Sep 17 00:00:00 2001
From: Utsho Dey
Date: Thu, 19 Dec 2024 16:37:57 +0600
Subject: [PATCH 1/3] fact check from image is functional

---
 app/api/fact_check.py                | 301 ++++++++++++++++++++++------
 app/models/fact_check_models.py      |  29 ++-
 app/services/image_text_extractor.py | 119 ++++++++++++
 3 files changed, 376 insertions(+), 73 deletions(-)
 create mode 100644 app/services/image_text_extractor.py
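
Notes (this area is ignored by git-am): a minimal smoke test for the reworked
/check-facts endpoint, sketched under two assumptions -- that the app is served
locally at http://localhost:8000 with fact_check_router mounted at the root,
and that the image URL below is a stand-in for a real screenshot of a claim.
Adjust both to your deployment.

    import asyncio
    import httpx

    async def main() -> None:
        async with httpx.AsyncClient(base_url="http://localhost:8000", timeout=120.0) as client:
            # Text claim, as supported before this series.
            r1 = await client.post("/check-facts", json={
                "query": "Did NASA confirm finding alien structures on Mars in 2024?"
            })
            # New in this series: fact check from an image URL; the text is
            # pulled out by ImageTextExtractor, then fact-checked like a query.
            r2 = await client.post("/check-facts", json={
                "url": "https://example.com/claim-screenshot.jpg"
            })
            for response in (r1, r2):
                report = response.json()
                print(report["verdict"], report["confidence"], report["claim"])

    asyncio.run(main())
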
z%Oh1Mo94LFCaa?!EAnNNfK6N_RQ~Y73?Ez!$e(iU#=<~P)#-%y>8G$vZ}hyv$6 zI=?HX=z}nxEuQ)EHM2Wub}u}=Vs3cV@IRbYKkQFXMfVCT7rwtz*z&6FUUAK$ZKc@v zs^ea{cgeR>z9&IB(*`T;T-=eS;I{Pia`%-3573*g-a|VUyVDfhmUCE2DJn(}YTz~sxBC?JJ^G~P0TE9h z)(y~dY`iZIr767M)!iLRoH}#&g};XlF$_qs6Nz9b5qTvMd?m?3iX?D44ryMZPhip~ zH6grRqC=R&1^C|-9uT4Q_cfHJWQ{IQ(&Y>Gh50Dx9gKBx-)-Po)Kt0!gd(+%apQ zQIwJHnRCxM_q*rbnfcD0^H1S__Ew#@*(?aISLBBWHqb1`_>BU;iGd|B*g1p(Hep zNXi&Vt53N7-YZ6gHY1E)MrI`92}1@CVf6`**M0;qrXVZ2slgNCNVBd&zCL5#EwYbP zBP2)j9vhcL`RoB{uEnd`yw(C{5e%UWQeei#Ylu@5yhkN1VYSa&u$2*E_PvNWGKLAG zXv?5ONVI1#NJoYN>3rQZ!KUrs;&oB4A}nD$)JOdu#+YH%uf5?S&Wtjg=%af-VNzWA zsEi46e8(R$C~kF|&)3XnOroC#ZA81DgHZErp=QOKAC_MrS&=ah70?H${>CS`%xR+- z$e0hI*9{Yv3_ld4DMT$9vlx8Mkl!$j4&HSKYN7gHpN}y^A{3|AFPJSE3khZ{OUP!w z%mc860s~m#I>;jrul6kS2K~ClB!+0}VpxA7uA6Hs*}Am%Vnq^VOmE`n@D8Me^NYFN z+JX*#S%#G2yssRQtrJ!yn(vFnickI5vgmg{&qD>l4(nsq9Z2*}z&WsD5VeFGz7X;I^3 zC83OgEj`jlBzbIDku{I{X~g5!oM4hi#z?<(Gx+#y;Y(Oe3fqtWF!X(vOMVVJSfjod zt!>tL7&|(WmL*aOnQF#8iZVLRQz^IA*Vh;{2Ika^=|oB*%V-255j9ZyTReXJe5nyv zx-@1`QivT~guaEEW!Js;b#0Gz$GSEWH#IQ5Tkltc<<;M_B;{lpvQGC)nsGQOD;gs! zM6-a2DDu9fvPa_*qwvj(no&&lE5t^Jo4fY+OQQpdXsN0DSg2y8bxI85PW0at2fJCY7 z?WSzUwu{2{&s1CG+FiK_61*==&RHwxLv_z=nJ=!GlBVv@A-p0wA6=P^G~|$BmF*K- zI4czA5LeQWGntE;p52@apm6=S#UKA=WiR%*vfldNG1;b#7gly<>$|h=o{PdJRj3TB z)z#&0Th>{3R6f>lWbffUS$oSxzEy2=gw@IF;N+{Y3s2%$3x4`;Zc_-Iwov`FwH@>i z%(%P5_<_Ypbw}sg9*oXfJFDR~-AJRRn=Djs*YyqzQRlFEQ;<1l=`?QgGBb7zof!um zKI5T5GhUWjf(*5U%&{hBCS1{JjMX!u|kGVLAz>U z^A$mX?&3u_q+ffpC%<5K8SjzryNwD;=NAxNc`MEQEz7&bR*9@XxWvoA83dVcXU$5z5 z$D4QVk<^IbSO081#1dKv)ISK#*sER^V%;PF{qaZ_C4$I+r1bAerUxT8=`=C`MmecS zk#R#?WV~=!TG~G<^(&GX(V1h(uRdVwnIuI3wX?=RXdR*_l7u{xhHfErNeD!ZpKw$+ zQK9ptPGgibNdpy>Ey*en8cX?izh>=}52X7S&p*UN9h^9lN+i={CG9v#A_?3FS@y|F zhh7YfLolh%Qkt+`g42?$=xZaio{|1(aox}{H#!7mqzEyqR9*vMrs^3cmecd!0 zg`?GFu|jN6=j{!vH&Tyv@$^NxQ6_z0c^;a)JI{(=T98oqx9HjhNA0{j`0|PuSEzS6 zHni8VjvN9pHLy@pccElumJfa6uf2i-=HN5kpA}Th2P1EUUk%Sks^(WTzPhgdXD1W z_+mNo1}}Lk<~$WwkiiyQaQd==)|37}hJPPE={QCHR5p`1UpW)V`nO+n?)b+_!2^_-s3jW2pJ zat5y;9`YA}pqC%ram;n(zQdcc?&gca>V@En!}>zz3oAdVUcOKqor)eBxXd!u?#nFK zZoi7SLf8?`3wsUX3&hzR*S68MPU`!lFZSzPF9!Utu5DS`1bzShEjPk3`N_`cJq&)o z40p9~@0T-FuUZ+aV@?N(x{>j8BkpSBPd9bip>xK7yVmh%7>4Sc0fNr(0t`6g#a->( z86QjaLfsyotlWeRXBphHmOE<9QSz zWg2;UT20fQ^6vGf=@t$qn_e$~<(#>^r_pq71+|=OWT~Z7#0)ma zma#LIj#wo-YsN8wpS2jNZpUDsbr@mTtcRvDTY_Vy+-!uVGF!<~?=lKHTgTmlOtVc; znS|M96V=z5Af_gwnFbPYm>EqHIA0+`e+$S?5E`c(7zJYKAhgbC7C7Y8^^c?nl7qJ~ z0{z?5{L}>ib#gyRjP~u59;W;w>z^vIsMGBcY*pJjN=xW(fqp((WD*DMQbO0yV8pHd zsG|yd)OR}yCL5_$|MJ_Y)&)W{j*h^|i+;AG9>i5>h|ur)D}G&SL>wEI+Q|@f;Z;EL zn;>!w#`t5j{O?Hk2n9Yy+vd==k5T7GsPUSC$A)VN#APbJbR*pUsW+@{&LY=`&d@83 z4_$CBf83Vi_F`dzs1Ke=)7eri{C^Ml!k4rDhUnClw9s{3+;QJ?RO7PVtXwxs?F Sg8Nkl8PfQ&0aTKNZ2kvOZqbea diff --git a/app/api/fact_check.py b/app/api/fact_check.py index 4d870a8..ee94bd6 100644 --- a/app/api/fact_check.py +++ b/app/api/fact_check.py @@ -1,9 +1,13 @@ from fastapi import APIRouter, HTTPException import httpx -from typing import Union +import asyncio +import logging +from typing import Union, Optional, Dict, Any from app.config import GOOGLE_API_KEY, GOOGLE_FACT_CHECK_BASE_URL, OPENAI_API_KEY from app.api.scrap_websites import search_websites, SearchRequest -from app.services.openai_client import OpenAIClient +from app.services.openai_client import OpenAIClient, AIFactChecker +from app.services.image_text_extractor import ImageTextExtractor +from app.models.ai_fact_check_models import AIFactCheckResponse from app.models.fact_check_models import ( FactCheckRequest, FactCheckResponse, @@ -15,11 +19,91 @@ from 
app.models.fact_check_models import ( ) from app.websites.fact_checker_website import get_all_sources +# Setup logging +logger = logging.getLogger(__name__) + fact_check_router = APIRouter() openai_client = OpenAIClient(OPENAI_API_KEY) +ai_fact_checker = AIFactChecker(openai_client) +image_text_extractor = ImageTextExtractor(OPENAI_API_KEY) -async def generate_fact_report(query: str, fact_check_data: dict) -> Union[FactCheckResponse, UnverifiedFactCheckResponse]: +async def process_url_content(url: str) -> Optional[str]: + """Extract text content from the provided URL.""" + try: + # Add await here + text = await image_text_extractor.extract_text(url, is_url=True) + if text: + logger.info(f"Successfully extracted text from URL: {text}") + else: + logger.warning(f"No text could be extracted from URL: {url}") + return text + except Exception as e: + logger.error(f"Error extracting text from URL: {str(e)}") + return None + + +async def process_fact_check(query: str) -> Union[FactCheckResponse, UnverifiedFactCheckResponse]: + """Process a single fact check query.""" + if not GOOGLE_API_KEY or not GOOGLE_FACT_CHECK_BASE_URL: + return UnverifiedFactCheckResponse( + claim=query, + verdict=VerdictEnum.UNVERIFIED, + confidence=ConfidenceEnum.LOW, + sources=[], + evidence="The fact-checking service is not properly configured.", + explanation="The system is missing required API configuration for fact-checking services.", + additional_context="This is a temporary system configuration issue." + ) + + headers = {"Content-Type": "application/json"} + async with httpx.AsyncClient() as client: + fact_checker_sources = get_all_sources() + + for source in fact_checker_sources: + params = { + "key": GOOGLE_API_KEY, + "query": query, + "languageCode": "en-US", + "reviewPublisherSiteFilter": source.domain, + "pageSize": 10, + } + + try: + response = await client.get( + GOOGLE_FACT_CHECK_BASE_URL, params=params, headers=headers + ) + response.raise_for_status() + json_response = response.json() + + if json_response.get("claims"): + return await generate_fact_report(query, json_response) + + except Exception as e: + logger.error(f"Error with source {source.domain}: {str(e)}") + continue + + try: + search_request = SearchRequest( + search_text=query, + source_types=["fact_checkers"] + ) + + ai_response = await search_websites(search_request) + return await generate_fact_report(query, ai_response) + + except Exception as e: + logger.error(f"Error in AI fact check: {str(e)}") + return await generate_fact_report(query, { + "status": "no_results", + "verification_result": { + "no_sources_found": True, + "reason": str(e) + } + }) + + +async def generate_fact_report(query: str, fact_check_data: dict | AIFactCheckResponse) -> Union[FactCheckResponse, UnverifiedFactCheckResponse]: """Generate a fact check report using OpenAI based on the fact check results.""" try: base_system_prompt = """You are a professional fact-checking reporter. Your task is to create a detailed fact check report based on the provided data. Focus on accuracy, clarity, and proper citation of sources. @@ -31,12 +115,23 @@ Rules: 4. Maintain objectivity in the report 5. 
If no reliable sources are found, provide a clear explanation why""" + # Handle both dictionary and AIFactCheckResponse + if hasattr(fact_check_data, 'verification_result'): + # It's an AIFactCheckResponse + has_sources = bool(fact_check_data.sources) + verification_result = fact_check_data.verification_result + fact_check_data_dict = fact_check_data.dict() + else: + # It's a dictionary + has_sources = bool(fact_check_data.get("claims") or fact_check_data.get("urls_found")) + verification_result = fact_check_data.get("verification_result", {}) + fact_check_data_dict = fact_check_data + # If no sources were found, return an unverified response - if not fact_check_data.get("claims") and ( - not fact_check_data.get("urls_found") or - fact_check_data.get("status") == "no_results" or - fact_check_data.get("verification_result", {}).get("no_sources_found") - ): + if not has_sources or ( + isinstance(fact_check_data, dict) and + fact_check_data.get("status") == "no_results" + ) or (verification_result and verification_result.get("no_sources_found")): return UnverifiedFactCheckResponse( claim=query, verdict=VerdictEnum.UNVERIFIED, @@ -63,10 +158,10 @@ Rules: "additional_context": "Important context about the verification process, limitations, or broader implications (1-2 sentences)" }""" - if "claims" in fact_check_data: + if isinstance(fact_check_data, dict) and "claims" in fact_check_data: system_prompt = base_system_prompt user_prompt = f"""Query: {query} - Fact Check Results: {fact_check_data} + Fact Check Results: {fact_check_data_dict} {base_user_prompt} @@ -75,11 +170,10 @@ Rules: 2. Specify verification dates when available 3. Name the fact-checking organizations involved 4. Describe the verification process""" - else: system_prompt = base_system_prompt user_prompt = f"""Query: {query} - Fact Check Results: {fact_check_data} + Fact Check Results: {fact_check_data_dict} {base_user_prompt} @@ -116,7 +210,7 @@ Rules: return FactCheckResponse(**response_data) except Exception as validation_error: - print(f"Response validation error: {str(validation_error)}") + logger.error(f"Response validation error: {str(validation_error)}") return UnverifiedFactCheckResponse( claim=query, verdict=VerdictEnum.UNVERIFIED, @@ -128,7 +222,7 @@ Rules: ) except Exception as e: - print(f"Error generating fact report: {str(e)}") + logger.error(f"Error generating fact report: {str(e)}") return UnverifiedFactCheckResponse( claim=query, verdict=VerdictEnum.UNVERIFIED, @@ -138,69 +232,138 @@ Rules: explanation="The system encountered an unexpected error while processing the fact check request.", additional_context="This is a technical error and does not reflect on the truthfulness of the claim." ) + +async def combine_fact_reports(query: str, url_text: str, query_result: Dict[str, Any], url_result: Dict[str, Any]) -> Union[FactCheckResponse, UnverifiedFactCheckResponse]: + """Combine fact check results from query and URL into a single comprehensive report.""" + try: + system_prompt = """You are a professional fact-checking reporter. Your task is to create a comprehensive fact check report by combining and analyzing multiple fact-checking results. Focus on accuracy, clarity, and proper citation of all sources. + +Rules: +1. Include all source URLs and names from both result sets +2. Compare and contrast findings from different sources +3. Include dates when available +4. Note any discrepancies between sources +5. 
Provide a balanced, objective analysis""" + + user_prompt = f"""Original Query: {query} +Extracted Text from URL: {url_text} + +First Fact Check Result: {query_result} +Second Fact Check Result: {url_result} + +Generate a comprehensive fact check report in this exact JSON format: +{{ + "claim": "Write the exact claim being verified", + "verdict": "One of: True/False/Partially True/Unverified", + "confidence": "One of: High/Medium/Low", + "sources": [ + {{ + "url": "Full URL of the source", + "name": "Name of the source organization" + }} + ], + "evidence": "A concise summary of the key evidence from both sources (2-3 sentences)", + "explanation": "A detailed explanation combining findings from both fact checks (3-4 sentences)", + "additional_context": "Important context about differences or similarities in findings (1-2 sentences)" +}} + +The report should: +1. Combine sources from both fact checks +2. Compare findings from both analyses +3. Note any differences in conclusions +4. Provide a unified verdict based on all available information""" + + response = await openai_client.generate_text_response( + system_prompt=system_prompt, + user_prompt=user_prompt, + max_tokens=1000 + ) + + response_data = response["response"] + + # Clean up sources from both results + if isinstance(response_data.get("sources"), list): + cleaned_sources = [] + for source in response_data["sources"]: + if isinstance(source, str): + url = source if source.startswith("http") else f"https://{source}" + cleaned_sources.append({"url": url, "name": source}) + elif isinstance(source, dict): + url = source.get("url", "") + if url and not url.startswith("http"): + source["url"] = f"https://{url}" + cleaned_sources.append(source) + response_data["sources"] = cleaned_sources + + if response_data["verdict"] == "Unverified" or not response_data.get("sources"): + return UnverifiedFactCheckResponse(**response_data) + return FactCheckResponse(**response_data) + + except Exception as e: + logger.error(f"Error combining fact reports: {str(e)}") + return UnverifiedFactCheckResponse( + claim=query, + verdict=VerdictEnum.UNVERIFIED, + confidence=ConfidenceEnum.LOW, + sources=[], + evidence="An error occurred while combining fact check reports.", + explanation="The system encountered an error while trying to combine results from multiple sources.", + additional_context="This is a technical error and does not reflect on the truthfulness of the claim." + ) @fact_check_router.post("/check-facts", response_model=Union[FactCheckResponse, UnverifiedFactCheckResponse]) async def check_facts(request: FactCheckRequest): """ Fetch fact check results and generate a comprehensive report. + Handles both query-based and URL-based fact checking. """ - if not GOOGLE_API_KEY or not GOOGLE_FACT_CHECK_BASE_URL: - return UnverifiedFactCheckResponse( - claim=request.query, - verdict=VerdictEnum.UNVERIFIED, - confidence=ConfidenceEnum.LOW, - sources=[], - evidence="The fact-checking service is not properly configured.", - explanation="The system is missing required API configuration for fact-checking services.", - additional_context="This is a temporary system configuration issue." 
- ) + url_text = None + query_result = None + url_result = None - headers = {"Content-Type": "application/json"} - async with httpx.AsyncClient() as client: - fact_checker_sources = get_all_sources() - - for source in fact_checker_sources: - params = { - "key": GOOGLE_API_KEY, - "query": request.query, - "languageCode": "en-US", - "reviewPublisherSiteFilter": source.domain, - "pageSize": 10, - } - - try: - response = await client.get( - GOOGLE_FACT_CHECK_BASE_URL, params=params, headers=headers - ) - response.raise_for_status() - json_response = response.json() - - if json_response.get("claims"): - return await generate_fact_report(request.query, json_response) - - except httpx.RequestError as e: - print(f"Error fetching results for site {source.domain}: {str(e)}") - continue - except Exception as e: - print(f"Unexpected error for site {source.domain}: {str(e)}") - continue - - try: - search_request = SearchRequest( - search_text=request.query, - source_types=["fact_checkers"] + # If URL is provided, try to extract text + if request.url: + url_text = await process_url_content(request.url) + if not url_text and not request.query: + # Only return early if URL text extraction failed and no query provided + return UnverifiedFactCheckResponse( + claim=f"URL check requested: {request.url}", + verdict=VerdictEnum.UNVERIFIED, + confidence=ConfidenceEnum.LOW, + sources=[], + evidence="Unable to extract text from the provided URL.", + explanation="The system could not process the content from the provided URL. The URL might be invalid or inaccessible.", + additional_context="Please provide a valid URL or a text query for fact-checking." ) + + # If URL text was successfully extracted, process it + if url_text: + logger.info(f"Processing fact check for extracted text: {url_text}") + url_result = await process_fact_check(url_text) - ai_response = await search_websites(search_request) - return await generate_fact_report(request.query, ai_response) + # Process query if provided + if request.query: + query_result = await process_fact_check(request.query) - except Exception as e: - print(f"Error in AI fact check: {str(e)}") - return await generate_fact_report(request.query, { - "status": "no_results", - "verification_result": { - "no_sources_found": True, - "reason": str(e) - } - }) \ No newline at end of file + # If both results are available, combine them + if query_result and url_result and url_text: + return await combine_fact_reports(request.query, url_text, + query_result.dict(), url_result.dict()) + + # If only one result is available + if query_result: + return query_result + if url_result: + return url_result + + # If no valid results + return UnverifiedFactCheckResponse( + claim=request.query or f"URL: {request.url}", + verdict=VerdictEnum.UNVERIFIED, + confidence=ConfidenceEnum.LOW, + sources=[], + evidence="Failed to process fact-checking request.", + explanation="The system encountered errors while processing the fact checks.", + additional_context="Please try again with different input or contact support if the issue persists." 
+ ) \ No newline at end of file diff --git a/app/models/__pycache__/fact_check_models.cpython-312.pyc b/app/models/__pycache__/fact_check_models.cpython-312.pyc index 91cf86c610d9265d0dcc28d32913c7e643b06482..64a94030489fd6cd337f62bc6b5b437c1b1fca86 100644 GIT binary patch delta 2519 zcmbtWOKclO7@pm=*Y9WSv`G_pORKooyoi_hmK@O=F&GtwQN_HQl!1*^M6C904knvfJUF)3S*_08GvJVKHI&UXOc zxsET9WN>;ms)o4K15z&)=A>ntFz5P!>tDxh;M@RkgX_2v&J6)Kyp9{4Xh=k!;dTkT z*(9`uEk7$1jMVjXE~}=E0!!G;UZK%#IZuuo*+M>@dw5=m3331|Sirm~?v%IH$gjl9 zo(3c%2vO4~ZHYD`$>l9b#WhCMv0XH^C%S(SMwghbE^9#3bC zzEPUFdYWD<+Nf>}nm}=wv8Ibxc>Tq1N25(8_&`9 zOhG%4W=1xXqY07ig`?R%081*e+?j-6N#l%W#AT20g|)-);wZpfGAWELyAI2hrwr}3 z;`RIDrYg{Ms~XazsQV@*-D4~_2IN6nk$xyl3PU7eD|MZ#)s!m6XxdQVlcQ+Mam|W0#m^8vXa%6aaOpVRjJy}_Jd!n9iY_FTW1yLW^uHOgJE3Jim)YQZN)kv zTG9wLEH|89q{kS;ma?QwPPb%evLe;jw%744DDadS_KOu=?n0VM8!7H=3Y;yqZ73Sl ztbJ3xmtYJ1HGnDdTqdC{ciq$OnXdWZ&WgS2k9<6Kdu%2;@84dLo&+NIhJFvXJZgPs zK78tt_f*vth*o^Oam{z_G4X>YbJBjWTv8wgycJZz?iemG4U(80UX8V##o@Tp4wSCs zVKvR2N@g_8pqD7q??X~@G|yfE_4^|0gqI~;*Rf3$r53w)H6}=FsTD#luQlq1m_d$h zr|$ybp?ZCl?k|RBL!Y0UO@Ef;fgPC-_dfFWRs+F`lgE|tv2~XfEE(p$RtgVLmZTfZ z3Zzo$e7<0$@#>fLREl>2bD-+02r|MO2z6Jz4S5JS4fGoTACV{9l&aHPZHQDG8mo~v ztHFUKkK7_p9jhF?(|5ajk;ry=!B3jEqgG^JHMoDtVQZGB`YK!R?7ZFDvq)^9;3aZo z#`f6Wx**E-#zl$9vFAuF1c@A|6zA>H1<_^?p`ejdwE@&-{%OnFx&P)F`xPN>{$M}c z!Mf1TL+InWI!-kc?do5)qezlf`%HaPEXa`+yD(prWLf~$lMd};^ zcklvI7tJrd!@@q(Rgrr&#a9PkomKl_!CR&ZFkrZwzxVVXG^n6&lkE z`IHV5UrVRx4TGiYa$(7Rh5T@Kgr(r9Df2?$c$wEQ4a>nqxNg7N#rognz|HIYV3&}G zAf(J#@Nw-&^^*B_a2C>XFBI>sYmBva;Fo7Vm+r+3&juvu`Ku~k@D4hrsqxGW1~9fAZ6jnXtT%`$9vk5u=d zb8(E(#G_gVaB;bFtUi`J_#VYlOrN#@(HzE@T6d#~Q? zANwEGi;oKh6YHFfu;7`8#5w%ephbY1>jwDx#WLJ(9SHYl5VJ=c#HPT!y z(p_C>a%e<`YY47{W@Nb*MiN>^?cCR>tp@qPy01VUgxrGAHLmO1^rFzu8$4hJ_`dw# z_L7h-uI9I>WuX?hTHK;mgj(Wid5b#s10K&;`&Gx@fIA(9?#A^AM`p@OCrO#M(n$m2 zhb(s??sVhzKAysAfo~)QdRux`ogC3$q{~(X=omni&dNK+CxD9hE5LRNpqJ#S(V?Am zAWz~s`jfnGS6iFs(Z22&tbEdMrHj{t*4hWb&0dhC`{`-r?x5*tWJ?0L3tC3r;K1Z0 zC9LFmejImGKjl6X&m)=qGq4@N=HFxwhi}mDmD*6ZhE;o5tqtvC!}7|erH-jzE`57w z6RA0MWS<|Rma&>+a*u@cGvg}WOPMkMmLoAa3}R+}=!d;viI6S?>#ULsJk=WY zswW5HNAVF~Hl4A|eWR&<>DI&L86V^vXaM*O-E03gdVq8EANvk(%boGY@yrmsczOj+ zehlylK;~U9f(Gyf)oX|HqBvMK9h)xKrUpz+`=55=)zP5nu-gEDf2~Xj;$B27Sc`mO zz{G6LfF{OD432OwQf*KFw>}?r+r2P2N_>6-W+J)C;V&6u{1hGjRY!-PqQ(={pwDYF Vc@3-I^*0g!vuV@%j)Mx={TC{4B3S?c diff --git a/app/models/fact_check_models.py b/app/models/fact_check_models.py index 59ffbfe..3ae7d50 100644 --- a/app/models/fact_check_models.py +++ b/app/models/fact_check_models.py @@ -1,5 +1,5 @@ -from pydantic import BaseModel, Field, HttpUrl, validator -from typing import List, Literal, Union +from pydantic import BaseModel, Field, HttpUrl, validator, root_validator +from typing import List, Literal, Union, Optional from datetime import datetime from enum import Enum @@ -18,13 +18,34 @@ class ConfidenceEnum(str, Enum): class FactCheckRequest(BaseModel): - query: str = Field( - ..., + query: Optional[str] = Field( + None, min_length=3, max_length=500, description="The claim or statement to be fact-checked", example="Did NASA confirm finding alien structures on Mars in 2024?", ) + url: Optional[str] = Field( + None, + description="URL to be fact-checked", + example="https://example.com/article", + ) + + @root_validator(pre=True) + def validate_at_least_one(cls, values): + """Validate that at least one of query or url is provided.""" + query = values.get('query') + url = values.get('url') + if not query and not url: + raise ValueError("At least one of 'query' or 'url' must be provided") + return values + + 
+    @validator('url')
+    def validate_url(cls, v):
+        """Validate URL format if provided."""
+        if v is not None and len(v) < 3:
+            raise ValueError("URL must be at least 3 characters")
+        return v
 
 
 class Source(BaseModel):
diff --git a/app/services/image_text_extractor.py b/app/services/image_text_extractor.py
new file mode 100644
index 0000000..395ffc3
--- /dev/null
+++ b/app/services/image_text_extractor.py
@@ -0,0 +1,119 @@
+import base64
+import requests
+import os
+from io import BytesIO
+from typing import Tuple, Optional
+import logging
+import aiohttp
+
+logger = logging.getLogger(__name__)
+
+class ImageTextExtractor:
+    def __init__(self, api_key: str):
+        """Initialize ImageTextExtractor with OpenAI API key."""
+        self.api_key = api_key
+        self.api_url = "https://api.openai.com/v1/chat/completions"
+        self.headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {api_key}"
+        }
+
+    def encode_image(self, image_path: str) -> str:
+        """Encode a local image into base64."""
+        try:
+            with open(image_path, "rb") as image_file:
+                return base64.b64encode(image_file.read()).decode('utf-8')
+        except Exception as e:
+            logger.error(f"Error encoding image: {str(e)}")
+            raise Exception(f"Error encoding image: {e}")
+
+    async def fetch_image_from_url(self, image_url: str) -> Tuple[str, str]:
+        """Fetch an image from a URL and encode it as base64."""
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(image_url) as response:
+                    if response.status != 200:
+                        raise Exception(f"Failed to fetch image: Status {response.status}")
+
+                    content_type = response.headers.get('Content-Type', '')
+                    if "text/html" in content_type:
+                        raise ValueError("The URL points to a webpage, not an image")
+                    if "image" not in content_type:
+                        raise ValueError("The URL does not point to a valid image")
+
+                    image_data = await response.read()
+                    image_format = "jpeg" if "jpeg" in content_type or "jpg" in content_type else "png"
+                    base64_image = base64.b64encode(image_data).decode('utf-8')
+                    return base64_image, image_format
+
+        except aiohttp.ClientError as e:
+            logger.error(f"Error fetching image from URL: {str(e)}")
+            raise Exception(f"Error fetching image from URL: {e}")
+        except ValueError as e:
+            raise
+        except Exception as e:
+            logger.error(f"Unexpected error processing image URL: {str(e)}")
+            raise Exception(f"Unexpected error processing image: {e}")
+
+    async def extract_text(self, image_input: str, is_url: bool = False) -> Optional[str]:
+        """Extract text from an image, either from a local path or URL."""
+        try:
+            if is_url:
+                try:
+                    base64_image, image_format = await self.fetch_image_from_url(image_input)
+                except ValueError as e:
+                    if "webpage" in str(e):
+                        return None
+                    raise
+            else:
+                if not os.path.exists(image_input):
+                    raise FileNotFoundError(f"Image file not found: {image_input}")
+                base64_image = self.encode_image(image_input)
+                image_format = "jpeg" if image_input.endswith(".jpg") else "png"
+
+            payload = {
+                "model": "gpt-4-turbo-2024-04-09",  # vision-capable GPT-4 Turbo snapshot
+                "messages": [
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": "Extract and return only the key text from this image in the original language. Do not provide translations or explanations."
+ }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/{image_format};base64,{base64_image}" + } + } + ] + } + ], + "max_tokens": 300 + } + + async with aiohttp.ClientSession() as session: + async with session.post(self.api_url, headers=self.headers, json=payload) as response: + if response.status != 200: + error_content = await response.text() + logger.error(f"API request failed: Status {response.status}, Response: {error_content}") + raise Exception(f"API request failed with status {response.status}") + + result = await response.json() + logger.debug(f"GPT-4 API Response: {result}") + + if 'choices' in result and len(result['choices']) > 0: + extracted_text = result['choices'][0]['message']['content'].strip() + if extracted_text: + return extracted_text + return None + + except (aiohttp.ClientError, ValueError, FileNotFoundError) as e: + logger.error(f"Error in text extraction: {str(e)}") + return None + except Exception as e: + logger.error(f"Unexpected error in text extraction: {str(e)}") + return None + + return None \ No newline at end of file diff --git a/app/websites/__pycache__/fact_checker_website.cpython-312.pyc b/app/websites/__pycache__/fact_checker_website.cpython-312.pyc index e4ce169e3a1593c1a2b7f1024be19bf4dd22ec0f..c943a2ce632655db9ed91fc3630d29d43816a2d0 100644 GIT binary patch delta 20 acmbQBHbIU1G%qg~0}v#BPu|GQDhvQNngsR$ delta 20 acmbQBHbIU1G%qg~0}v$TCvN0s6$St^X9R5k -- 2.45.3 From 7c4dd378cda07bc383b9369b012f3331d0477876 Mon Sep 17 00:00:00 2001 From: Utsho Dey Date: Thu, 19 Dec 2024 16:47:18 +0600 Subject: [PATCH 2/3] fact check from image is functional --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 1fbe5de..b55be16 100644 --- a/requirements.txt +++ b/requirements.txt @@ -42,11 +42,12 @@ mccabe==0.7.0 mdurl==0.1.2 multidict==6.1.0 mypy-extensions==1.0.0 -numpy==1.26.4 +numpy==2.2.0 openai==0.28.0 orjson==3.10.12 packaging==24.2 pathspec==0.12.1 +pillow==11.0.0 platformdirs==4.3.6 pluggy==1.5.0 propcache==0.2.1 -- 2.45.3 From c8735de51eda4b78920bc7c70523d718a9cf96bc Mon Sep 17 00:00:00 2001 From: Utsho Dey Date: Thu, 19 Dec 2024 16:49:17 +0600 Subject: [PATCH 3/3] fact check from image is functional --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b55be16..d4bd3bf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -42,7 +42,7 @@ mccabe==0.7.0 mdurl==0.1.2 multidict==6.1.0 mypy-extensions==1.0.0 -numpy==2.2.0 +numpy==1.26.4 openai==0.28.0 orjson==3.10.12 packaging==24.2 -- 2.45.3