content fact checker is functional

Utsho Dey 2024-12-17 18:05:50 +06:00
parent e56163a8c3
commit d59f5c884e
5 changed files with 82 additions and 98 deletions

View file

@@ -1,4 +1,3 @@
-from langchain_community.document_loaders import AsyncHtmlLoader
 from langchain_community.document_transformers import BeautifulSoupTransformer
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_core.documents import Document
@@ -7,6 +6,8 @@ import numpy as np
 import logging as logger
 import openai
 import json
+import aiohttp
+from bs4 import BeautifulSoup
 
 class OpenAIClient:
     def __init__(self, api_key: str):
@@ -67,20 +68,32 @@ class AIFactChecker:
             length_function=len,
             separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
         )
 
     async def scrape_webpage(self, url: str) -> List[Document]:
-        """Scrape webpage content using LangChain's AsyncHtmlLoader."""
+        """Scrape webpage content without saving HTML files."""
         try:
-            loader = AsyncHtmlLoader([url])
-            docs = await loader.aload()
-
-            bs_transformer = BeautifulSoupTransformer()
-            docs_transformed = bs_transformer.transform_documents(docs)
-
-            docs_chunks = self.text_splitter.split_documents(docs_transformed)
-
-            logger.info(f"Successfully scraped webpage | chunks={len(docs_chunks)}")
-            return docs_chunks
+            async with aiohttp.ClientSession() as session:
+                async with session.get(url) as response:
+                    if response.status != 200:
+                        raise Exception(f"Failed to fetch URL: {url}, status: {response.status}")
+                    html_content = await response.text()
+
+            # Parse HTML with BeautifulSoup
+            soup = BeautifulSoup(html_content, 'html.parser')
+
+            # Create a Document with the parsed content
+            doc = Document(
+                page_content=soup.get_text(separator='\n', strip=True),
+                metadata={"source": url}
+            )
+
+            # Split into chunks
+            docs_chunks = self.text_splitter.split_documents([doc])
+
+            logger.info(f"Successfully scraped webpage | chunks={len(docs_chunks)}")
+            return docs_chunks
         except Exception as e:
             logger.error(f"Error scraping webpage | url={url} | error={str(e)}")
             raise
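
For reference, a minimal sketch of how the reworked scrape_webpage could be exercised. The import path, constructor arguments, and URL below are assumptions, since only the AIFactChecker hunks are shown in this diff:

    import asyncio

    from fact_checker import AIFactChecker  # hypothetical module path; not part of this commit


    async def main() -> None:
        # Assumed constructor; only the text splitter setup and scrape_webpage are visible above.
        checker = AIFactChecker(api_key="sk-...")

        # Fetches the page with aiohttp, strips tags with BeautifulSoup,
        # and returns the RecursiveCharacterTextSplitter chunks as Documents.
        chunks = await checker.scrape_webpage("https://example.com/article")
        for chunk in chunks:
            print(chunk.metadata["source"], len(chunk.page_content))


    asyncio.run(main())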

View file

@@ -1,6 +1,61 @@
-certifi==2024.8.30
+aiofiles==24.1.0
+aiohappyeyeballs==2.4.4
+aiohttp==3.11.10
+aiosignal==1.3.2
+annotated-types==0.7.0
+anyio==4.7.0
+attrs==24.3.0
+beautifulsoup4==4.12.3
+certifi==2024.12.14
 charset-normalizer==3.4.0
+click==8.1.7
+dataclasses-json==0.6.7
+dnspython==2.7.0
+email_validator==2.2.0
+fastapi==0.115.6
+frozenlist==1.5.0
+greenlet==3.1.1
+gunicorn==23.0.0
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.28.1
+httpx-sse==0.4.0
 idna==3.10
+jsonpatch==1.33
+jsonpointer==3.0.0
+langchain==0.3.12
+langchain-community==0.3.12
+langchain-core==0.3.25
+langchain-text-splitters==0.3.3
+langsmith==0.2.3
+marshmallow==3.23.1
+multidict==6.1.0
+mypy-extensions==1.0.0
+numpy==2.2.0
+openai==0.28.0
+orjson==3.10.12
+packaging==24.2
+propcache==0.2.1
+pydantic==2.10.3
+pydantic-settings==2.7.0
+pydantic_core==2.27.1
+python-dateutil==2.9.0.post0
 python-dotenv==1.0.1
+python-json-logger==3.2.1
+python-multipart==0.0.20
+PyYAML==6.0.2
 requests==2.32.3
+requests-toolbelt==1.0.0
+six==1.17.0
+sniffio==1.3.1
+soupsieve==2.6
+SQLAlchemy==2.0.36
+starlette==0.41.3
+tenacity==9.0.0
+tqdm==4.67.1
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+ujson==5.10.0
 urllib3==2.2.3
+uvicorn==0.34.0
+yarl==1.18.3
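
The dependency bump mainly pins aiohttp and beautifulsoup4 for the rewritten scraper, alongside the FastAPI and LangChain stack. A quick import check after pip install -r requirements.txt, assuming the pinned versions above are what gets installed:

    # Confirm the scraper's new dependencies resolve with the pinned versions.
    import aiohttp
    import bs4

    print(aiohttp.__version__)  # expected 3.11.10 per the pin above
    print(bs4.__version__)      # expected 4.12.3 per the pin above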

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long