From 1a1a713e0fbe79edef9e90764568a3a2f7c9b8fc Mon Sep 17 00:00:00 2001
From: Utsho Dey
Date: Thu, 12 Dec 2024 17:31:44 +0600
Subject: [PATCH] base code added

---
 .gitignore                                     |   2 +-
 app/__pycache__/config.cpython-312.pyc         | Bin 507 -> 507 bytes
 .../__pycache__/fact_check.cpython-312.pyc     | Bin 3800 -> 4224 bytes
 app/api/ai_fact_check.py                       | 112 ++++++++++
 app/api/fact_check.py                          |  52 ++++-
 app/api/scrap_websites.py                      | 160 ++++++++++++++
 .../fact_check_models.cpython-312.pyc          | Bin 5577 -> 9611 bytes
 app/models/fact_check_models.py                | 201 ++++++++++++++----
 app/services/openai_client.py                  | 173 +++++++++++++++
 main.py                                        |   4 +
 10 files changed, 656 insertions(+), 48 deletions(-)
 create mode 100644 app/api/ai_fact_check.py
 create mode 100644 app/api/scrap_websites.py
 create mode 100644 app/services/openai_client.py

diff --git a/.gitignore b/.gitignore
index 21d6e87..cd4609c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
 env
 .env
 test.py
-/__pycache__/
\ No newline at end of file
+__pycache__
\ No newline at end of file
diff --git a/app/__pycache__/config.cpython-312.pyc b/app/__pycache__/config.cpython-312.pyc
index 267bba1df1c923dade63cc9cc26619d42bbdc2a5..91b068866373f8d9210744c00633448a6572698b 100644
GIT binary patch
[base85 delta omitted: compiled bytecode, not human-readable]

diff --git a/app/api/__pycache__/fact_check.cpython-312.pyc b/app/api/__pycache__/fact_check.cpython-312.pyc
index 56cc8981234a04f5d9b6da8c11310f6f86677fde..d2530f5d85725dc2f22291a59760ae01d00a57cd 100644
GIT binary patch
[base85 delta omitted: compiled bytecode, not human-readable]
diff --git a/app/api/ai_fact_check.py b/app/api/ai_fact_check.py
new file mode 100644
[112-line hunk not recoverable from this copy of the patch]
diff --git a/app/api/fact_check.py b/app/api/fact_check.py
--- a/app/api/fact_check.py
+++ b/app/api/fact_check.py
@@ ... @@ async def check_facts(request: FactCheckRequest) -> FactCheckResponse:
     """
     Check facts using multiple fact-checking sources
     """
     all_results = []
+    verified_results = []

     # Validate configuration
     if not GOOGLE_API_KEY or not GOOGLE_FACT_CHECK_BASE_URL:
@@ -46,6 +51,8 @@ async def check_facts(request: FactCheckRequest) -> FactCheckResponse:

     # Get all sources in priority order
     all_sources = get_all_sources()
+    all_sources_list = []  # To store source URLs
+    contexts_used = []  # To store context snippets

     for source in all_sources:
         try:
@@ -58,11 +65,17 @@ async def check_facts(request: FactCheckRequest) -> FactCheckResponse:

             if "claims" in result:
                 # Validate each claim through Pydantic
-                validated_claims = [
-                    Claim(**claim).dict()
-                    for claim in result["claims"]
-                ]
-                all_results.extend(validated_claims)
+                for claim in result["claims"]:
+                    validated_claim = Claim(**claim).dict()
+                    all_results.append(validated_claim)
+
+                    # Extract source and context information
+                    if "claimReview" in validated_claim:
+                        review = validated_claim["claimReview"][0]
+                        if "publisher" in review and "site" in review["publisher"]:
+                            all_sources_list.append(review["publisher"]["site"])
+                        if "textualRating" in review:
+                            contexts_used.append(review["textualRating"])

         except HTTPException:
             raise
@@ -81,14 +94,33 @@ async def check_facts(request: FactCheckRequest) -> FactCheckResponse:
             ).dict()
         )

-    # Create the response using Pydantic model
+    # Prepare the verification result
+    verification_result = {
+        "verdict": "Insufficient Information",  # Default verdict
+        "confidence": "Low",
+        "evidence": contexts_used,
+        "reasoning": "Based on available fact checks",
+        "missing_info": "Additional verification may be needed"
+    }
+
+    # Create token usage information
+    token_usage = TokenUsage(
+        prompt_tokens=0,
+        completion_tokens=0,
+        total_tokens=0
+    )
+
+    # Create the response using Pydantic model with all required fields
     response = FactCheckResponse(
         query=request.content,
         total_claims_found=len(all_results),
         results=all_results,
+        verification_result=verification_result,
+        sources=list(set(all_sources_list)),
+        context_used=contexts_used,
+        token_usage=token_usage,
         summary={
-            "total_sources": len(set(claim.get("claimReview", [{}])[0].get("publisher", {}).get("site", "")
-                                     for claim in all_results if claim.get("claimReview"))),
+            "total_sources": len(set(all_sources_list)),
             "fact_checking_sites_queried": len(all_sources)
         }
     )
diff --git a/app/api/scrap_websites.py b/app/api/scrap_websites.py
new file mode 100644
index 0000000..8a1f48f
--- /dev/null
+++ b/app/api/scrap_websites.py
@@ -0,0 +1,160 @@
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel
+from typing import List, Dict
+import requests
+from bs4 import BeautifulSoup
+import urllib.parse
+import numpy as np
+from app.services.openai_client import OpenAIClient
+from app.config import OPENAI_API_KEY
+
+scrap_websites_router = APIRouter()
+
+class SearchRequest(BaseModel):
+    search_text: str
+    site_domains: List[str]
+
+class UrlSimilarityInfo(BaseModel):
+    url: str
+    similarity: float
+    extracted_text: str
+
+class SearchResponse(BaseModel):
+    results: Dict[str, List[str]]
+    error_messages: Dict[str, str]
+    url_similarities: Dict[str, List[UrlSimilarityInfo]]
+
+def extract_url_text(url: str) -> str:
+    """Extract and process meaningful text from a URL path with improved cleaning."""
+    try:
+        # Parse the URL and get the path
+        parsed = urllib.parse.urlparse(url)
+        path = parsed.path
+
+        # Remove common URL parts and file extensions
+        path = path.replace('.html', '').replace('/index', '').replace('.php', '')
+
+        # Split path into segments
+        segments = [seg for seg in path.split('/') if seg]
+
+        # Remove dates and numbers
+        cleaned_segments = []
+        for segment in segments:
+            # Replace hyphens and underscores with spaces
+            segment = segment.replace('-', ' ').replace('_', ' ')
+
+            # Filter out segments that are just dates or numbers
+            if not (segment.replace(' ', '').isdigit() or
+                    all(part.isdigit() for part in segment.split() if part)):
+                cleaned_segments.append(segment)
+
+        # Remove very common words that don't add meaning
+        common_words = {
+            'www', 'live', 'news', 'intl', 'index', 'world', 'us', 'uk',
+            'updates', 'update', 'latest', 'breaking', 'new', 'article'
+        }
+
+        # Join segments and split into words
+        text = ' '.join(cleaned_segments)
+        words = [word.lower() for word in text.split()
+                 if word.lower() not in common_words and len(word) > 1]
+
+        return ' '.join(words)
+    except Exception:
+        return ''
+
+def google_search_scraper(search_text: str, site_domain: str) -> List[str]:
+    # Keep site: outside the quotes; a quoted "site:..." is treated as a
+    # literal phrase rather than a search operator
+    query = f"{search_text} site:{site_domain}"
+    encoded_query = urllib.parse.quote(query)
+    base_url = "https://www.google.com/search"
+    url = f"{base_url}?q={encoded_query}"
+
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+    }
+
+    try:
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.content, 'html.parser')
+        search_results = soup.find_all('div', class_='g')
+
+        urls = []
+        for result in search_results[:5]:
+            link = result.find('a')
+            if link and 'href' in link.attrs:
+                url = link['href']
+                if url.startswith('http'):
+                    urls.append(url)
+
+        return urls[:5]
+
+    except requests.RequestException as e:
+        raise HTTPException(status_code=500, detail=f"Error scraping {site_domain}: {str(e)}")
+
+def calculate_similarity(query_embedding: List[float], url_embedding: List[float]) -> float:
+    query_array = np.array(query_embedding)
+    url_array = np.array(url_embedding)
+
+    similarity = np.dot(url_array, query_array) / (
+        np.linalg.norm(url_array) * np.linalg.norm(query_array)
+    )
+    return float(similarity)
+
+@scrap_websites_router.post("/search", response_model=SearchResponse)
+async def search_websites(request: SearchRequest):
+    results = {}
+    error_messages = {}
+    url_similarities = {}
+
+    # Initialize OpenAI client
+    openai_client = OpenAIClient(OPENAI_API_KEY)
+
+    # Embed the search text once; every candidate URL is compared against this embedding
+    search_context = request.search_text
+    query_embedding = openai_client.get_embeddings([search_context])[0]
+
+    # Higher similarity threshold for better filtering
+    SIMILARITY_THRESHOLD = 0.75
+
+    for domain in request.site_domains:
+        try:
+            urls = google_search_scraper(request.search_text, domain)
+            url_sims = []
+            valid_urls = []
+
+            for url in urls:
+                url_text = extract_url_text(url)
+
+                # Skip URLs with no meaningful text extracted
+                if not url_text:
+                    continue
+
+                url_embedding = openai_client.get_embeddings([url_text])[0]
+                similarity = calculate_similarity(query_embedding, url_embedding)
+
+                url_sims.append(UrlSimilarityInfo(
+                    url=url,
+                    similarity=similarity,
+                    extracted_text=url_text
+                ))
+
+                if similarity >= SIMILARITY_THRESHOLD:
+                    valid_urls.append(url)
+
+            results[domain] = valid_urls
+            url_similarities[domain] = sorted(url_sims,
+                                              key=lambda x: x.similarity,
+                                              reverse=True)
+
+        except HTTPException as e:
+            error_messages[domain] = str(e.detail)
+        except Exception as e:
+            error_messages[domain] = f"Unexpected error for {domain}: {str(e)}"
+
+    return SearchResponse(
+        results=results,
+        error_messages=error_messages,
+        url_similarities=url_similarities
+    )
\ No newline at end of file
diff --git a/app/models/__pycache__/fact_check_models.cpython-312.pyc b/app/models/__pycache__/fact_check_models.cpython-312.pyc
index e2c8b7cf78c3bf747b3006eb0a7078967671dd0e..239346e1dd0ca70c102289a9908f973365f9b466 100644
GIT binary patch
[base85 data omitted: compiled bytecode, not human-readable]
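As a quick reference for reviewers, a minimal sketch of how the new /search route could be exercised once this patch is applied. It is not part of the patch: the local host/port, the timeout, and the example domain are assumptions, and the claim text reuses the example from the request models below.

import requests

# Assumes the FastAPI app from main.py is running locally and that
# OPENAI_API_KEY is configured; the domain below is only an illustration.
payload = {
    "search_text": "Indian flag was drawn in BUET campus",
    "site_domains": ["altnews.in"],
}

response = requests.post("http://localhost:8000/search", json=payload, timeout=120)
response.raise_for_status()
data = response.json()

print(data["results"])         # per domain: URLs above the 0.75 similarity threshold
print(data["error_messages"])  # per domain: scraping failures, if any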
z=FX&fTlO`#LVJ4;gP69{de_sc}EwZK^hXJ0R@XI7zH|N;2tNQ%Z6gl>KA?=)pDBlJ6#Wr;OH9#t^?U z7GKUSjm5q^Y_0mvJ%?(!8SBkcE5 zk0V{hs=^^vhC`|~i6{U}Y->6lB@JO5p`$1$eowby5(N7LfE&{NSpTYh??8SlHg|1m z<;bQwcRxM)?x~M{u$9i;Q~Pf#n`-Xjst5R?%!zDvv@vmb+p7#lwnussfp3(=VP8{9B(%5AwmN&7-w3@N%;I8NM+g6V zXe8gUSk%1OhCB#19=F$}M*OjBXB*L>w|0&QwE0#=W9axidKFH)Bwq+Wk%i**6OHJ^ zPZSbZQ=qDXyHypahm0`)bvU`q;AoN<`eHu@xTCP=Q2rr+>nT%LUtm^^Sx&nuu+&7Q zQtNLDwA1kU z%g{>vud&IV7z}Z>Vy)`Xs>7b;mBi2}Zn~>2n2tkP)z!;pwX|rKs7TC1tXg)FZzP8J zzqK?!-TTY2BS6{M-I`c7$82{a1itmeSvc<>ahhD?8FmHsqvm(+U zx@0duC3I*Zh|Gw)qk^AZwfY)z2nG1r7x{n-jV68#LTtnmExv4b3Z%2 z;n}$I*}1Lk$$M(*_Nh(vxTi|*86kRTY5cMU>My&ALtNxGvbfrcX%k)`x1%~0-Zr=d-Mv-Ot zfi(4JY3hMA{wHbtE2-}*X=tRWhGlJarYXUznO5Y;?F&r_Ud;f1FgEsNO4e@sn-aX5 i$2{`zYJ8P6B@mi=5QbSz30}=qL{6^O|0TgoMEE~GjXT-^ diff --git a/app/models/fact_check_models.py b/app/models/fact_check_models.py index 6c85771..bec9977 100644 --- a/app/models/fact_check_models.py +++ b/app/models/fact_check_models.py @@ -1,7 +1,14 @@ from pydantic import BaseModel, Field, HttpUrl, validator, ConfigDict -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Any, Union from enum import Enum from datetime import datetime +from urllib.parse import urlparse + +# Common Models +class TokenUsage(BaseModel): + prompt_tokens: Optional[int] = 0 + completion_tokens: Optional[int] = 0 + total_tokens: Optional[int] = 0 class ErrorResponse(BaseModel): detail: str @@ -18,11 +25,7 @@ class ErrorResponse(BaseModel): } }) -class RequestValidationError(BaseModel): - loc: List[str] - msg: str - type: str - +# Fact Check Models class Publisher(BaseModel): name: str site: Optional[str] = Field(None, description="Publisher's website") @@ -47,11 +50,116 @@ class Claim(BaseModel): claimDate: Optional[str] = None claimReview: List[ClaimReview] -class FactCheckResponse(BaseModel): - query: str = Field(..., description="Original query that was fact-checked") - total_claims_found: int = Field(..., ge=0) - results: List[Claim] = Field(default_factory=list) - summary: Dict[str, int] = Field(...) 
+class SourceType(str, Enum): + FACT_CHECKER = "fact_checker" + NEWS_SITE = "news_site" + +class FactCheckSource(BaseModel): + domain: str + type: SourceType + priority: int = Field(default=1, ge=1, le=10) + +# Verification Models +class VerificationResult(BaseModel): + verdict: str = Field(..., description="True/False/Insufficient Information") + confidence: str = Field(..., description="High/Medium/Low") + evidence: Union[str, List[str]] + reasoning: str + missing_info: Optional[str] = None + + model_config = ConfigDict(json_schema_extra={ + "example": { + "verdict": "True", + "confidence": "High", + "evidence": ["Direct quote from source supporting the claim"], + "reasoning": "Detailed analysis of why the claim is considered true", + "missing_info": "Any caveats or limitations of the verification" + } + }) + +# Request Models +class BaseFactCheckRequest(BaseModel): + content: str = Field( + ..., + min_length=10, + max_length=1000, + description="The claim to be fact-checked" + ) + + @validator('content') + def validate_content(cls, v): + if not v.strip(): + raise ValueError("Content cannot be empty or just whitespace") + return v.strip() + +class GoogleFactCheckRequest(BaseFactCheckRequest): + language: str = Field(default="en-US", pattern="^[a-z]{2}-[A-Z]{2}$") + max_results_per_source: int = Field(default=10, ge=1, le=50) + +class AIFactCheckRequest(BaseFactCheckRequest): + urls: List[str] = Field( + ..., + min_items=1, + max_items=5, + description="List of URLs to check the content against. URLs will be prefixed with https:// if protocol is missing" + ) + + @validator('urls') + def validate_urls(cls, urls): + validated_urls = [] + for url in urls: + if not url.strip(): + raise ValueError("URL cannot be empty") + + # Add https:// if no protocol specified + if not url.startswith(('http://', 'https://')): + url = f'https://{url}' + + try: + result = urlparse(url) + if not result.netloc: + raise ValueError(f"Invalid URL structure for {url}") + validated_urls.append(url) + except Exception as e: + raise ValueError(f"Invalid URL {url}: {str(e)}") + + return validated_urls + + model_config = ConfigDict(json_schema_extra={ + "example": { + "content": "Indian flag was drawn in BUET campus", + "urls": [ + "www.altnews.in/article-about-flag", + "www.another-source.com/related-news" + ] + } + }) + +# Response Models +class BaseFactCheckResponse(BaseModel): + query: str + token_usage: TokenUsage + sources: List[str] + context_used: List[str] + + model_config = ConfigDict(json_schema_extra={ + "example": { + "query": "Example statement to verify", + "token_usage": { + "prompt_tokens": 100, + "completion_tokens": 50, + "total_tokens": 150 + }, + "sources": ["source1.com", "source2.com"], + "context_used": ["Relevant context from sources"] + } + }) + +class GoogleFactCheckResponse(BaseFactCheckResponse): + total_claims_found: int + results: List[Dict[str, Any]] + verification_result: Dict[str, Any] + summary: Dict[str, int] model_config = ConfigDict(json_schema_extra={ "example": { @@ -68,6 +176,19 @@ class FactCheckResponse(BaseModel): "textualRating": "True" }] }], + "verification_result": { + "verdict": "True", + "confidence": "High", + "evidence": ["Supporting evidence"], + "reasoning": "Detailed analysis" + }, + "sources": ["factchecker.com"], + "context_used": ["Relevant context"], + "token_usage": { + "prompt_tokens": 100, + "completion_tokens": 50, + "total_tokens": 150 + }, "summary": { "total_sources": 1, "fact_checking_sites_queried": 10 @@ -75,35 +196,41 @@ class 
FactCheckResponse(BaseModel):
         }
     })
 
-class SourceType(str, Enum):
-    FACT_CHECKER = "fact_checker"
-    NEWS_SITE = "news_site"
-
-class FactCheckSource(BaseModel):
-    domain: str
-    type: SourceType
-    priority: int = Field(default=1, ge=1, le=10)
+class AIFactCheckResponse(BaseFactCheckResponse):
+    verification_result: Dict[str, VerificationResult]  # Keyed by URL: one result per source
 
     model_config = ConfigDict(json_schema_extra={
         "example": {
-            "domain": "factcheck.org",
-            "type": "fact_checker",
-            "priority": 1
+            "query": "Indian flag was drawn in BUET campus",
+            "verification_result": {
+                "https://www.source1.com": {
+                    "verdict": "True",
+                    "confidence": "High",
+                    "evidence": ["Supporting evidence from source 1"],
+                    "reasoning": "Detailed analysis from source 1",
+                    "missing_info": None
+                },
+                "https://www.source2.com": {
+                    "verdict": "True",
+                    "confidence": "Medium",
+                    "evidence": ["Supporting evidence from source 2"],
+                    "reasoning": "Analysis from source 2",
+                    "missing_info": "Additional context needed"
+                }
+            },
+            "sources": ["source1.com", "source2.com"],
+            "context_used": [
+                "Context from source 1",
+                "Context from source 2"
+            ],
+            "token_usage": {
+                "prompt_tokens": 200,
+                "completion_tokens": 100,
+                "total_tokens": 300
+            }
         }
     })
 
-class FactCheckRequest(BaseModel):
-    content: str = Field(
-        ...,
-        min_length=10,
-        max_length=1000,
-        description="The claim to be fact-checked"
-    )
-    language: str = Field(default="en-US", pattern="^[a-z]{2}-[A-Z]{2}$")
-    max_results_per_source: int = Field(default=10, ge=1, le=50)
-
-    @validator('content')
-    def validate_content(cls, v):
-        if not v.strip():
-            raise ValueError("Content cannot be empty or just whitespace")
-        return v.strip()
\ No newline at end of file
+# Backwards compatibility aliases
+FactCheckRequest = GoogleFactCheckRequest
+FactCheckResponse = GoogleFactCheckResponse
\ No newline at end of file
diff --git a/app/services/openai_client.py b/app/services/openai_client.py
new file mode 100644
index 0000000..22541bb
--- /dev/null
+++ b/app/services/openai_client.py
@@ -0,0 +1,173 @@
+from langchain_community.document_loaders import AsyncHtmlLoader
+from langchain_community.document_transformers import BeautifulSoupTransformer
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_core.documents import Document
+from typing import List, Dict, Any
+import numpy as np
+import logging
+import openai
+import json
+
+# Use a module-level logger rather than aliasing the logging module itself
+logger = logging.getLogger(__name__)
+
+class OpenAIClient:
+    def __init__(self, api_key: str):
+        """
+        Initialize OpenAI client with the provided API key.
+        """
+        openai.api_key = api_key
+
+    async def generate_text_response(self, system_prompt: str, user_prompt: str, max_tokens: int) -> dict:
+        """
+        Generate a response using OpenAI's chat completion API.
+ """ + try: + response = openai.ChatCompletion.create( + model="gpt-4", + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ], + max_tokens=max_tokens + ) + content = response['choices'][0]['message']['content'] + # Parse the JSON string into a dictionary + parsed_content = json.loads(content) + + return { + "response": parsed_content, # Now returns a dictionary instead of string + "prompt_tokens": response['usage']['prompt_tokens'], + "completion_tokens": response['usage']['completion_tokens'], + "total_tokens": response['usage']['total_tokens'] + } + except json.JSONDecodeError as e: + raise Exception(f"Failed to parse OpenAI response as JSON: {str(e)}") + except Exception as e: + raise Exception(f"OpenAI text generation error: {str(e)}") + + def get_embeddings(self, texts: List[str]) -> List[List[float]]: + """ + Retrieve embeddings for a list of texts using OpenAI's embedding API. + """ + try: + response = openai.Embedding.create( + input=texts, + model="text-embedding-ada-002" + ) + embeddings = [data['embedding'] for data in response['data']] + return embeddings + except Exception as e: + raise Exception(f"OpenAI embedding error: {str(e)}") + +class AIFactChecker: + def __init__(self, openai_client: OpenAIClient): + """Initialize the fact checker with OpenAI client.""" + self.openai_client = openai_client + self.text_splitter = RecursiveCharacterTextSplitter( + chunk_size=1000, + chunk_overlap=200, + length_function=len, + separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""] + ) + + async def scrape_webpage(self, url: str) -> List[Document]: + """Scrape webpage content using LangChain's AsyncHtmlLoader.""" + try: + loader = AsyncHtmlLoader([url]) + docs = await loader.aload() + + bs_transformer = BeautifulSoupTransformer() + docs_transformed = bs_transformer.transform_documents(docs) + docs_chunks = self.text_splitter.split_documents(docs_transformed) + + logger.info(f"Successfully scraped webpage | chunks={len(docs_chunks)}") + return docs_chunks + + except Exception as e: + logger.error(f"Error scraping webpage | url={url} | error={str(e)}") + raise + + def find_relevant_chunks( + self, + query_embedding: List[float], + doc_embeddings: List[List[float]], + docs: List[Document] + ) -> List[Document]: + """Find most relevant document chunks using cosine similarity.""" + try: + query_array = np.array(query_embedding) + chunks_array = np.array(doc_embeddings) + + similarities = np.dot(chunks_array, query_array) / ( + np.linalg.norm(chunks_array, axis=1) * np.linalg.norm(query_array) + ) + + top_indices = np.argsort(similarities)[-5:][::-1] + return [docs[i] for i in top_indices] + + except Exception as e: + logger.error(f"Error finding relevant chunks | error={str(e)}") + raise + + async def verify_fact(self, query: str, relevant_docs: List[Document]) -> Dict[str, Any]: + """Verify fact using OpenAI's API with context from relevant documents.""" + try: + context = "\n\n".join([doc.page_content for doc in relevant_docs]) + + system_prompt = """You are a professional fact-checking assistant. Analyze the provided context + and determine if the given statement is true, false, or if there isn't enough information. 
+ + Provide your response in the following JSON format: + { + "verdict": "True/False/Insufficient Information", + "confidence": "High/Medium/Low", + "evidence": "Direct quotes or evidence from the context", + "reasoning": "Your detailed analysis and reasoning", + "missing_info": "Any important missing information (if applicable)" + }""" + + user_prompt = f"""Context: + {context} + + Statement to verify: "{query}" + + Analyze the statement based on the provided context and return your response in the specified JSON format.""" + + response = await self.openai_client.generate_text_response( + system_prompt=system_prompt, + user_prompt=user_prompt, + max_tokens=800 + ) + + sources = list(set([doc.metadata.get('source', 'Unknown source') for doc in relevant_docs])) + + return { + "verification_result": response["response"], # This is now a dictionary + "sources": sources, + "context_used": [doc.page_content for doc in relevant_docs], + "token_usage": { + "prompt_tokens": response["prompt_tokens"], + "completion_tokens": response["completion_tokens"], + "total_tokens": response["total_tokens"] + } + } + + except Exception as e: + logger.error(f"Error verifying fact | error={str(e)}") + raise + + async def check_fact(self, url: str, query: str) -> Dict[str, Any]: + """Main method to check a fact against a webpage.""" + try: + docs = await self.scrape_webpage(url) + + doc_texts = [doc.page_content for doc in docs] + doc_embeddings = self.openai_client.get_embeddings(doc_texts) + query_embedding = self.openai_client.get_embeddings([query]) + + relevant_docs = self.find_relevant_chunks(query_embedding[0], doc_embeddings, docs) + verification_result = await self.verify_fact(query, relevant_docs) + + return verification_result + + except Exception as e: + logger.error(f"Error checking fact | error={str(e)}") + raise \ No newline at end of file diff --git a/main.py b/main.py index 6b79e28..25d68c4 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,8 @@ from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from app.api.fact_check import fact_check_router +from app.api.ai_fact_check import aifact_check_router +from app.api.scrap_websites import scrap_websites_router from app.config import FRONTEND_URL # Initialize FastAPI app @@ -39,6 +41,8 @@ async def health_check(): return {"status": "healthy"} app.include_router(fact_check_router, prefix="") +app.include_router(aifact_check_router, prefix="") +app.include_router(scrap_websites_router, prefix="") # Include routers (uncomment and modify as needed) # from routes import some_router
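Likewise, a minimal sketch of how AIFactChecker could be driven directly from a script, bypassing the HTTP routes; check_fact() runs the scrape, embed, rank, and verify steps above in order. The URL is a placeholder taken from the model examples in this patch, and a valid OpenAI key in app.config is assumed.

import asyncio

from app.config import OPENAI_API_KEY
from app.services.openai_client import OpenAIClient, AIFactChecker

async def main():
    checker = AIFactChecker(OpenAIClient(OPENAI_API_KEY))
    result = await checker.check_fact(
        url="https://www.altnews.in/article-about-flag",  # placeholder URL from the examples
        query="Indian flag was drawn in BUET campus",
    )
    print(result["verification_result"]["verdict"])  # True / False / Insufficient Information
    print(result["sources"])                         # source URLs of the chunks used
    print(result["token_usage"])

asyncio.run(main())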