content fact checked is functional
parent e56163a8c3
commit d59f5c884e
5 changed files with 82 additions and 98 deletions
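In short: the scraper in AIFactChecker drops LangChain's AsyncHtmlLoader and BeautifulSoupTransformer in favor of a direct in-memory aiohttp fetch parsed with BeautifulSoup, and the dependency list grows from 6 loose pins to 61 fully pinned packages. (The two diffs below are the ones the page renders; filenames are not shown in this capture.)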
@@ -1,4 +1,3 @@
-from langchain_community.document_loaders import AsyncHtmlLoader
 from langchain_community.document_transformers import BeautifulSoupTransformer
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_core.documents import Document
@ -7,6 +6,8 @@ import numpy as np
|
||||||
import logging as logger
|
import logging as logger
|
||||||
import openai
|
import openai
|
||||||
import json
|
import json
|
||||||
|
import aiohttp
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
class OpenAIClient:
|
class OpenAIClient:
|
||||||
def __init__(self, api_key: str):
|
def __init__(self, api_key: str):
|
||||||
|
|
@ -67,20 +68,32 @@ class AIFactChecker:
|
||||||
length_function=len,
|
length_function=len,
|
||||||
separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
|
separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
|
||||||
)
|
)
|
||||||
|
|
||||||
async def scrape_webpage(self, url: str) -> List[Document]:
|
async def scrape_webpage(self, url: str) -> List[Document]:
|
||||||
"""Scrape webpage content using LangChain's AsyncHtmlLoader."""
|
"""Scrape webpage content without saving HTML files."""
|
||||||
try:
|
try:
|
||||||
loader = AsyncHtmlLoader([url])
|
async with aiohttp.ClientSession() as session:
|
||||||
docs = await loader.aload()
|
async with session.get(url) as response:
|
||||||
|
if response.status != 200:
|
||||||
bs_transformer = BeautifulSoupTransformer()
|
raise Exception(f"Failed to fetch URL: {url}, status: {response.status}")
|
||||||
docs_transformed = bs_transformer.transform_documents(docs)
|
|
||||||
docs_chunks = self.text_splitter.split_documents(docs_transformed)
|
html_content = await response.text()
|
||||||
|
|
||||||
logger.info(f"Successfully scraped webpage | chunks={len(docs_chunks)}")
|
# Parse HTML with BeautifulSoup
|
||||||
return docs_chunks
|
soup = BeautifulSoup(html_content, 'html.parser')
|
||||||
|
|
||||||
|
# Create a Document with the parsed content
|
||||||
|
doc = Document(
|
||||||
|
page_content=soup.get_text(separator='\n', strip=True),
|
||||||
|
metadata={"source": url}
|
||||||
|
)
|
||||||
|
|
||||||
|
# Split into chunks
|
||||||
|
docs_chunks = self.text_splitter.split_documents([doc])
|
||||||
|
|
||||||
|
logger.info(f"Successfully scraped webpage | chunks={len(docs_chunks)}")
|
||||||
|
return docs_chunks
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error scraping webpage | url={url} | error={str(e)}")
|
logger.error(f"Error scraping webpage | url={url} | error={str(e)}")
|
||||||
raise
|
raise
|
||||||
|
|
|
||||||
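For readers who want to try the new fetch-and-split path on its own, here is a minimal standalone sketch. It uses only dependencies added or pinned in this commit (aiohttp, beautifulsoup4, langchain); the splitter's chunk_size and chunk_overlap are illustrative guesses, since AIFactChecker's constructor is outside this diff.

    import asyncio
    from typing import List

    import aiohttp
    from bs4 import BeautifulSoup
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain_core.documents import Document

    # chunk_size/chunk_overlap are illustrative; the real values are set in
    # AIFactChecker.__init__, which this diff does not show.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    )

    async def scrape(url: str) -> List[Document]:
        # Fetch the raw HTML entirely in memory; nothing is written to disk.
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                if response.status != 200:
                    raise Exception(f"Failed to fetch URL: {url}, status: {response.status}")
                html = await response.text()

        # Keep only the visible text, then split into overlapping chunks.
        text = BeautifulSoup(html, "html.parser").get_text(separator="\n", strip=True)
        return splitter.split_documents([Document(page_content=text, metadata={"source": url})])

    if __name__ == "__main__":
        chunks = asyncio.run(scrape("https://example.com"))
        print(f"{len(chunks)} chunks")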
@@ -1,6 +1,61 @@
-certifi==2024.8.30
+aiofiles==24.1.0
+aiohappyeyeballs==2.4.4
+aiohttp==3.11.10
+aiosignal==1.3.2
+annotated-types==0.7.0
+anyio==4.7.0
+attrs==24.3.0
+beautifulsoup4==4.12.3
+certifi==2024.12.14
 charset-normalizer==3.4.0
+click==8.1.7
+dataclasses-json==0.6.7
+dnspython==2.7.0
+email_validator==2.2.0
+fastapi==0.115.6
+frozenlist==1.5.0
+greenlet==3.1.1
+gunicorn==23.0.0
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.28.1
+httpx-sse==0.4.0
 idna==3.10
+jsonpatch==1.33
+jsonpointer==3.0.0
+langchain==0.3.12
+langchain-community==0.3.12
+langchain-core==0.3.25
+langchain-text-splitters==0.3.3
+langsmith==0.2.3
+marshmallow==3.23.1
+multidict==6.1.0
+mypy-extensions==1.0.0
+numpy==2.2.0
+openai==0.28.0
+orjson==3.10.12
+packaging==24.2
+propcache==0.2.1
+pydantic==2.10.3
+pydantic-settings==2.7.0
+pydantic_core==2.27.1
+python-dateutil==2.9.0.post0
 python-dotenv==1.0.1
+python-json-logger==3.2.1
+python-multipart==0.0.20
+PyYAML==6.0.2
 requests==2.32.3
+requests-toolbelt==1.0.0
+six==1.17.0
+sniffio==1.3.1
+soupsieve==2.6
+SQLAlchemy==2.0.36
+starlette==0.41.3
+tenacity==9.0.0
+tqdm==4.67.1
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+ujson==5.10.0
 urllib3==2.2.3
+uvicorn==0.34.0
+yarl==1.18.3
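One pin worth noting: openai==0.28.0 predates the openai 1.x client rewrite, so the `import openai` in the fact checker presumably goes through the legacy module-level API. A hedged sketch of that call style at this pin follows; the API key, model name, and prompt are placeholders, not taken from this repo.

    import openai

    openai.api_key = "sk-..."  # placeholder; a real key would come from the app's config

    # At 0.28.x, chat completions are module-level calls; the 1.x style
    # (OpenAI().chat.completions.create) does not exist at this pin.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # hypothetical model choice, not from the diff
        messages=[{"role": "user", "content": "Is this claim supported by the source?"}],
    )
    print(response["choices"][0]["message"]["content"])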
3 file diffs suppressed because one or more lines are too long