content fact checker is functional

Utsho Dey 2024-12-17 18:05:50 +06:00
parent e56163a8c3
commit d59f5c884e
5 changed files with 82 additions and 98 deletions

View file

@@ -1,4 +1,3 @@
-from langchain_community.document_loaders import AsyncHtmlLoader
 from langchain_community.document_transformers import BeautifulSoupTransformer
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_core.documents import Document
@@ -7,6 +6,8 @@ import numpy as np
 import logging as logger
 import openai
 import json
+import aiohttp
+from bs4 import BeautifulSoup
 
 class OpenAIClient:
     def __init__(self, api_key: str):
@@ -69,14 +70,26 @@ class AIFactChecker:
         )
 
     async def scrape_webpage(self, url: str) -> List[Document]:
-        """Scrape webpage content using LangChain's AsyncHtmlLoader."""
+        """Scrape webpage content without saving HTML files."""
         try:
-            loader = AsyncHtmlLoader([url])
-            docs = await loader.aload()
-
-            bs_transformer = BeautifulSoupTransformer()
-            docs_transformed = bs_transformer.transform_documents(docs)
-
-            docs_chunks = self.text_splitter.split_documents(docs_transformed)
+            async with aiohttp.ClientSession() as session:
+                async with session.get(url) as response:
+                    if response.status != 200:
+                        raise Exception(f"Failed to fetch URL: {url}, status: {response.status}")
+
+                    html_content = await response.text()
+
+            # Parse HTML with BeautifulSoup
+            soup = BeautifulSoup(html_content, 'html.parser')
+
+            # Create a Document with the parsed content
+            doc = Document(
+                page_content=soup.get_text(separator='\n', strip=True),
+                metadata={"source": url}
+            )
+
+            # Split into chunks
+            docs_chunks = self.text_splitter.split_documents([doc])
 
             logger.info(f"Successfully scraped webpage | chunks={len(docs_chunks)}")
             return docs_chunks
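
For reference, a minimal standalone sketch of the same fetch → parse → split flow introduced above; the fetch_page_chunks name and the chunk_size/chunk_overlap values are illustrative assumptions, not part of this commit.

import asyncio
from typing import List

import aiohttp
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document


async def fetch_page_chunks(url: str) -> List[Document]:
    # Fetch raw HTML; fail loudly on non-200 responses.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            if response.status != 200:
                raise Exception(f"Failed to fetch URL: {url}, status: {response.status}")
            html_content = await response.text()

    # Strip markup and keep readable text only.
    soup = BeautifulSoup(html_content, 'html.parser')
    doc = Document(page_content=soup.get_text(separator='\n', strip=True),
                   metadata={"source": url})

    # Chunk settings here are assumed, not taken from the commit.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    return splitter.split_documents([doc])


if __name__ == "__main__":
    print(len(asyncio.run(fetch_page_chunks("https://example.com"))))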

View file

@@ -1,6 +1,61 @@
certifi==2024.8.30
aiofiles==24.1.0
aiohappyeyeballs==2.4.4
aiohttp==3.11.10
aiosignal==1.3.2
annotated-types==0.7.0
anyio==4.7.0
attrs==24.3.0
beautifulsoup4==4.12.3
certifi==2024.12.14
charset-normalizer==3.4.0
click==8.1.7
dataclasses-json==0.6.7
dnspython==2.7.0
email_validator==2.2.0
fastapi==0.115.6
frozenlist==1.5.0
greenlet==3.1.1
gunicorn==23.0.0
h11==0.14.0
httpcore==1.0.7
httpx==0.28.1
httpx-sse==0.4.0
idna==3.10
jsonpatch==1.33
jsonpointer==3.0.0
langchain==0.3.12
langchain-community==0.3.12
langchain-core==0.3.25
langchain-text-splitters==0.3.3
langsmith==0.2.3
marshmallow==3.23.1
multidict==6.1.0
mypy-extensions==1.0.0
numpy==2.2.0
openai==0.28.0
orjson==3.10.12
packaging==24.2
propcache==0.2.1
pydantic==2.10.3
pydantic-settings==2.7.0
pydantic_core==2.27.1
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-json-logger==3.2.1
python-multipart==0.0.20
PyYAML==6.0.2
requests==2.32.3
requests-toolbelt==1.0.0
six==1.17.0
sniffio==1.3.1
soupsieve==2.6
SQLAlchemy==2.0.36
starlette==0.41.3
tenacity==9.0.0
tqdm==4.67.1
typing-inspect==0.9.0
typing_extensions==4.12.2
ujson==5.10.0
urllib3==2.2.3
uvicorn==0.34.0
yarl==1.18.3

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long