content fact checker is functional

Utsho Dey 2024-12-17 18:05:50 +06:00
parent e56163a8c3
commit d59f5c884e
5 changed files with 82 additions and 98 deletions

View file

@@ -1,4 +1,3 @@
-from langchain_community.document_loaders import AsyncHtmlLoader
 from langchain_community.document_transformers import BeautifulSoupTransformer
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_core.documents import Document
@@ -7,6 +6,8 @@ import numpy as np
 import logging as logger
 import openai
 import json
+import aiohttp
+from bs4 import BeautifulSoup
 
 class OpenAIClient:
     def __init__(self, api_key: str):
@@ -67,20 +68,32 @@ class AIFactChecker:
             length_function=len,
             separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
         )
 
     async def scrape_webpage(self, url: str) -> List[Document]:
-        """Scrape webpage content using LangChain's AsyncHtmlLoader."""
+        """Scrape webpage content without saving HTML files."""
         try:
-            loader = AsyncHtmlLoader([url])
-            docs = await loader.aload()
-
-            bs_transformer = BeautifulSoupTransformer()
-            docs_transformed = bs_transformer.transform_documents(docs)
-
-            docs_chunks = self.text_splitter.split_documents(docs_transformed)
-
-            logger.info(f"Successfully scraped webpage | chunks={len(docs_chunks)}")
-            return docs_chunks
+            async with aiohttp.ClientSession() as session:
+                async with session.get(url) as response:
+                    if response.status != 200:
+                        raise Exception(f"Failed to fetch URL: {url}, status: {response.status}")
+                    html_content = await response.text()
+
+            # Parse HTML with BeautifulSoup
+            soup = BeautifulSoup(html_content, 'html.parser')
+
+            # Create a Document with the parsed content
+            doc = Document(
+                page_content=soup.get_text(separator='\n', strip=True),
+                metadata={"source": url}
+            )
+
+            # Split into chunks
+            docs_chunks = self.text_splitter.split_documents([doc])
+
+            logger.info(f"Successfully scraped webpage | chunks={len(docs_chunks)}")
+            return docs_chunks
         except Exception as e:
             logger.error(f"Error scraping webpage | url={url} | error={str(e)}")
             raise
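
For reference, a minimal sketch of how the reworked scrape_webpage could be exercised. The import path, constructor arguments, and URL below are assumptions, since only the AIFactChecker hunks are shown in this diff:

    import asyncio

    from fact_checker import AIFactChecker  # hypothetical module path; not part of this commit


    async def main() -> None:
        # Assumed constructor; only the text splitter setup and scrape_webpage are visible above.
        checker = AIFactChecker(api_key="sk-...")

        # Fetches the page with aiohttp, strips tags with BeautifulSoup,
        # and returns the RecursiveCharacterTextSplitter chunks as Documents.
        chunks = await checker.scrape_webpage("https://example.com/article")
        for chunk in chunks:
            print(chunk.metadata["source"], len(chunk.page_content))


    asyncio.run(main())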

View file

@@ -1,6 +1,61 @@
-certifi==2024.8.30
+aiofiles==24.1.0
+aiohappyeyeballs==2.4.4
+aiohttp==3.11.10
+aiosignal==1.3.2
+annotated-types==0.7.0
+anyio==4.7.0
+attrs==24.3.0
+beautifulsoup4==4.12.3
+certifi==2024.12.14
 charset-normalizer==3.4.0
+click==8.1.7
+dataclasses-json==0.6.7
+dnspython==2.7.0
+email_validator==2.2.0
+fastapi==0.115.6
+frozenlist==1.5.0
+greenlet==3.1.1
+gunicorn==23.0.0
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.28.1
+httpx-sse==0.4.0
 idna==3.10
+jsonpatch==1.33
+jsonpointer==3.0.0
+langchain==0.3.12
+langchain-community==0.3.12
+langchain-core==0.3.25
+langchain-text-splitters==0.3.3
+langsmith==0.2.3
+marshmallow==3.23.1
+multidict==6.1.0
+mypy-extensions==1.0.0
+numpy==2.2.0
+openai==0.28.0
+orjson==3.10.12
+packaging==24.2
+propcache==0.2.1
+pydantic==2.10.3
+pydantic-settings==2.7.0
+pydantic_core==2.27.1
+python-dateutil==2.9.0.post0
 python-dotenv==1.0.1
+python-json-logger==3.2.1
+python-multipart==0.0.20
+PyYAML==6.0.2
 requests==2.32.3
+requests-toolbelt==1.0.0
+six==1.17.0
+sniffio==1.3.1
+soupsieve==2.6
+SQLAlchemy==2.0.36
+starlette==0.41.3
+tenacity==9.0.0
+tqdm==4.67.1
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+ujson==5.10.0
 urllib3==2.2.3
+uvicorn==0.34.0
+yarl==1.18.3
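
The dependency bump mainly pins aiohttp and beautifulsoup4 for the rewritten scraper, alongside the FastAPI and LangChain stack. A quick import check after pip install -r requirements.txt, assuming the pinned versions above are what gets installed:

    # Confirm the scraper's new dependencies resolve with the pinned versions.
    import aiohttp
    import bs4

    print(aiohttp.__version__)  # expected 3.11.10 per the pin above
    print(bs4.__version__)      # expected 4.12.3 per the pin above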

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long