content fact checked is functional
This commit is contained in:
parent e56163a8c3
commit d59f5c884e

5 changed files with 82 additions and 98 deletions
@@ -1,4 +1,3 @@
-from langchain_community.document_loaders import AsyncHtmlLoader
 from langchain_community.document_transformers import BeautifulSoupTransformer
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_core.documents import Document
@@ -7,6 +6,8 @@ import numpy as np
 import logging as logger
 import openai
 import json
+import aiohttp
+from bs4 import BeautifulSoup
 
 class OpenAIClient:
     def __init__(self, api_key: str):
@@ -69,14 +70,26 @@ class AIFactChecker:
         )
 
     async def scrape_webpage(self, url: str) -> List[Document]:
-        """Scrape webpage content using LangChain's AsyncHtmlLoader."""
+        """Scrape webpage content without saving HTML files."""
         try:
-            loader = AsyncHtmlLoader([url])
-            docs = await loader.aload()
+            async with aiohttp.ClientSession() as session:
+                async with session.get(url) as response:
+                    if response.status != 200:
+                        raise Exception(f"Failed to fetch URL: {url}, status: {response.status}")
 
-            bs_transformer = BeautifulSoupTransformer()
-            docs_transformed = bs_transformer.transform_documents(docs)
-            docs_chunks = self.text_splitter.split_documents(docs_transformed)
+                    html_content = await response.text()
+
+            # Parse HTML with BeautifulSoup
+            soup = BeautifulSoup(html_content, 'html.parser')
+
+            # Create a Document with the parsed content
+            doc = Document(
+                page_content=soup.get_text(separator='\n', strip=True),
+                metadata={"source": url}
+            )
+
+            # Split into chunks
+            docs_chunks = self.text_splitter.split_documents([doc])
 
             logger.info(f"Successfully scraped webpage | chunks={len(docs_chunks)}")
             return docs_chunks
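For reference, the replacement flow is: fetch the raw HTML with aiohttp, reduce it to visible text with BeautifulSoup, wrap it in a langchain Document, then split it into chunks. Below is a minimal standalone sketch of that flow; the chunk_size/chunk_overlap values are illustrative assumptions, since the commit relies on whatever self.text_splitter was configured with elsewhere in AIFactChecker.

import asyncio
from typing import List

import aiohttp
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

async def scrape_webpage(url: str) -> List[Document]:
    # Fetch raw HTML in memory; nothing is written to disk.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            if response.status != 200:
                raise Exception(f"Failed to fetch URL: {url}, status: {response.status}")
            html_content = await response.text()

    # Strip markup down to visible text.
    soup = BeautifulSoup(html_content, "html.parser")
    doc = Document(
        page_content=soup.get_text(separator="\n", strip=True),
        metadata={"source": url},
    )

    # Chunk for downstream embedding / LLM calls.
    # chunk_size/chunk_overlap are assumptions, not values from this commit.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    return splitter.split_documents([doc])

# Usage: chunks = asyncio.run(scrape_webpage("https://example.com"))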
@@ -1,6 +1,61 @@
-certifi==2024.8.30
+aiofiles==24.1.0
+aiohappyeyeballs==2.4.4
+aiohttp==3.11.10
+aiosignal==1.3.2
+annotated-types==0.7.0
+anyio==4.7.0
+attrs==24.3.0
+beautifulsoup4==4.12.3
+certifi==2024.12.14
+charset-normalizer==3.4.0
+click==8.1.7
+dataclasses-json==0.6.7
+dnspython==2.7.0
+email_validator==2.2.0
+fastapi==0.115.6
+frozenlist==1.5.0
+greenlet==3.1.1
+gunicorn==23.0.0
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.28.1
+httpx-sse==0.4.0
+idna==3.10
+jsonpatch==1.33
+jsonpointer==3.0.0
+langchain==0.3.12
+langchain-community==0.3.12
+langchain-core==0.3.25
+langchain-text-splitters==0.3.3
+langsmith==0.2.3
+marshmallow==3.23.1
+multidict==6.1.0
+mypy-extensions==1.0.0
+numpy==2.2.0
+openai==0.28.0
+orjson==3.10.12
+packaging==24.2
+propcache==0.2.1
+pydantic==2.10.3
+pydantic-settings==2.7.0
+pydantic_core==2.27.1
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-json-logger==3.2.1
+python-multipart==0.0.20
+PyYAML==6.0.2
+requests==2.32.3
+requests-toolbelt==1.0.0
+six==1.17.0
+sniffio==1.3.1
+soupsieve==2.6
+SQLAlchemy==2.0.36
+starlette==0.41.3
+tenacity==9.0.0
+tqdm==4.67.1
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+ujson==5.10.0
+urllib3==2.2.3
+uvicorn==0.34.0
+yarl==1.18.3
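One pin worth noting: openai==0.28.0 is the last pre-1.0 release of the SDK, so calls go through the module-level ChatCompletion API rather than the 1.x OpenAI client object. A minimal sketch in that 0.28 style; the model choice, prompt, and check_claim helper are illustrative assumptions, not taken from this commit.

import openai

openai.api_key = "sk-..."  # assumption: the real app injects this via config/env

def check_claim(claim: str, evidence: str) -> str:
    # openai==0.28.0 module-level API; in openai>=1.0 this becomes
    # client.chat.completions.create(...).
    response = openai.ChatCompletion.create(
        model="gpt-4",  # illustrative model name
        messages=[
            {"role": "system", "content": "You are a fact checker. Answer SUPPORTED, REFUTED, or INSUFFICIENT."},
            {"role": "user", "content": f"Claim: {claim}\n\nEvidence: {evidence}"},
        ],
        temperature=0,
    )
    return response["choices"][0]["message"]["content"]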
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long