content fact checker is functional

Utsho Dey 2024-12-17 18:05:50 +06:00
parent e56163a8c3
commit d59f5c884e
5 changed files with 82 additions and 98 deletions

View file

@@ -1,4 +1,3 @@
-from langchain_community.document_loaders import AsyncHtmlLoader
 from langchain_community.document_transformers import BeautifulSoupTransformer
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_core.documents import Document
@@ -7,6 +6,8 @@ import numpy as np
 import logging as logger
 import openai
 import json
+import aiohttp
+from bs4 import BeautifulSoup
 
 class OpenAIClient:
     def __init__(self, api_key: str):
@@ -69,14 +70,26 @@ class AIFactChecker:
         )
 
     async def scrape_webpage(self, url: str) -> List[Document]:
-        """Scrape webpage content using LangChain's AsyncHtmlLoader."""
+        """Scrape webpage content without saving HTML files."""
         try:
-            loader = AsyncHtmlLoader([url])
-            docs = await loader.aload()
-
-            bs_transformer = BeautifulSoupTransformer()
-            docs_transformed = bs_transformer.transform_documents(docs)
-
-            docs_chunks = self.text_splitter.split_documents(docs_transformed)
+            async with aiohttp.ClientSession() as session:
+                async with session.get(url) as response:
+                    if response.status != 200:
+                        raise Exception(f"Failed to fetch URL: {url}, status: {response.status}")
+
+                    html_content = await response.text()
+
+            # Parse HTML with BeautifulSoup
+            soup = BeautifulSoup(html_content, 'html.parser')
+
+            # Create a Document with the parsed content
+            doc = Document(
+                page_content=soup.get_text(separator='\n', strip=True),
+                metadata={"source": url}
+            )
+
+            # Split into chunks
+            docs_chunks = self.text_splitter.split_documents([doc])
 
             logger.info(f"Successfully scraped webpage | chunks={len(docs_chunks)}")
             return docs_chunks
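
For reference, a minimal standalone sketch of the same fetch → parse → split flow introduced above; the fetch_page_chunks name and the chunk_size/chunk_overlap values are illustrative assumptions, not part of this commit.

import asyncio
from typing import List

import aiohttp
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document


async def fetch_page_chunks(url: str) -> List[Document]:
    # Fetch raw HTML; fail loudly on non-200 responses.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            if response.status != 200:
                raise Exception(f"Failed to fetch URL: {url}, status: {response.status}")
            html_content = await response.text()

    # Strip markup and keep readable text only.
    soup = BeautifulSoup(html_content, 'html.parser')
    doc = Document(page_content=soup.get_text(separator='\n', strip=True),
                   metadata={"source": url})

    # Chunk settings here are assumed, not taken from the commit.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    return splitter.split_documents([doc])


if __name__ == "__main__":
    print(len(asyncio.run(fetch_page_chunks("https://example.com"))))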

View file

@@ -1,6 +1,61 @@
certifi==2024.8.30
aiofiles==24.1.0
aiohappyeyeballs==2.4.4
aiohttp==3.11.10
aiosignal==1.3.2
annotated-types==0.7.0
anyio==4.7.0
attrs==24.3.0
beautifulsoup4==4.12.3
certifi==2024.12.14
charset-normalizer==3.4.0
click==8.1.7
dataclasses-json==0.6.7
dnspython==2.7.0
email_validator==2.2.0
fastapi==0.115.6
frozenlist==1.5.0
greenlet==3.1.1
gunicorn==23.0.0
h11==0.14.0
httpcore==1.0.7
httpx==0.28.1
httpx-sse==0.4.0
idna==3.10
jsonpatch==1.33
jsonpointer==3.0.0
langchain==0.3.12
langchain-community==0.3.12
langchain-core==0.3.25
langchain-text-splitters==0.3.3
langsmith==0.2.3
marshmallow==3.23.1
multidict==6.1.0
mypy-extensions==1.0.0
numpy==2.2.0
openai==0.28.0
orjson==3.10.12
packaging==24.2
propcache==0.2.1
pydantic==2.10.3
pydantic-settings==2.7.0
pydantic_core==2.27.1
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-json-logger==3.2.1
python-multipart==0.0.20
PyYAML==6.0.2
requests==2.32.3
requests-toolbelt==1.0.0
six==1.17.0
sniffio==1.3.1
soupsieve==2.6
SQLAlchemy==2.0.36
starlette==0.41.3
tenacity==9.0.0
tqdm==4.67.1
typing-inspect==0.9.0
typing_extensions==4.12.2
ujson==5.10.0
urllib3==2.2.3
uvicorn==0.34.0
yarl==1.18.3

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long