Merge pull request #2 from aravindkarnam/staging

Staging
This commit is contained in:
aravind
2024-09-18 18:16:23 +05:30
committed by GitHub
7 changed files with 54 additions and 12 deletions

View File

@@ -12,6 +12,7 @@ import json, uuid
import hashlib
from pathlib import Path
from playwright.async_api import ProxySettings
from pydantic import BaseModel
def calculate_semaphore_count():
cpu_count = os.cpu_count()
@@ -20,13 +21,18 @@ def calculate_semaphore_count():
memory_based_cap = int(memory_gb / 2) # Assume 2GB per instance
return min(base_count, memory_based_cap)
class AsyncCrawlResponse(BaseModel):
    """Raw HTML plus HTTP response metadata for a single crawl.

    Returned by ``AsyncCrawlerStrategy.crawl`` / ``crawl_many`` so callers get
    the status code and headers alongside the page content.
    """

    # Full page HTML, from the live browser or the on-disk cache.
    html: str
    # Headers of the final HTTP response; empty dict when none were recorded.
    response_headers: Dict[str, str]
    # HTTP status of the final response. Optional because the cache-read path
    # does meta.get("status_code"), which is None for pre-existing or partial
    # .meta files — a required `int` would make cache hits raise a pydantic
    # ValidationError.
    status_code: Optional[int] = None
class AsyncCrawlerStrategy(ABC):
@abstractmethod
async def crawl(self, url: str, **kwargs) -> str:
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
pass
@abstractmethod
async def crawl_many(self, urls: List[str], **kwargs) -> List[str]:
async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
pass
@abstractmethod
@@ -140,7 +146,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
for sid in expired_sessions:
asyncio.create_task(self.kill_session(sid))
async def crawl(self, url: str, **kwargs) -> str:
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
response_headers = {}
status_code = None
self._cleanup_expired_sessions()
session_id = kwargs.get("session_id")
if session_id:
@@ -168,13 +177,25 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if self.use_cached_html:
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest())
if os.path.exists(cache_file_path):
html = ""
with open(cache_file_path, "r") as f:
return f.read()
html = f.read()
# retrieve response headers and status code from cache
with open(cache_file_path + ".meta", "r") as f:
meta = json.load(f)
response_headers = meta.get("response_headers", {})
status_code = meta.get("status_code")
response = AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code)
return response
if not kwargs.get("js_only", False):
await self.execute_hook('before_goto', page)
await page.goto(url, wait_until="domcontentloaded", timeout=60000)
response = await page.goto(url, wait_until="domcontentloaded", timeout=60000)
await self.execute_hook('after_goto', page)
# Get status code and headers
status_code = response.status
response_headers = response.headers
await page.wait_for_selector('body')
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
@@ -202,8 +223,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest())
with open(cache_file_path, "w", encoding="utf-8") as f:
f.write(html)
# store response headers and status code in cache
with open(cache_file_path + ".meta", "w", encoding="utf-8") as f:
json.dump({
"response_headers": response_headers,
"status_code": status_code
}, f)
return html
response = AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code)
return response
except Error as e:
raise Error(f"Failed to crawl {url}: {str(e)}")
finally:
@@ -218,7 +246,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# except Exception as e:
# raise Exception(f"Failed to crawl {url}: {str(e)}")
async def crawl_many(self, urls: List[str], **kwargs) -> List[str]:
async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
semaphore_count = kwargs.get('semaphore_count', calculate_semaphore_count())
semaphore = asyncio.Semaphore(semaphore_count)

View File

@@ -8,7 +8,7 @@ from .models import CrawlResult
from .async_database import async_db_manager
from .chunking_strategy import *
from .extraction_strategy import *
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
from .content_scrapping_strategy import WebScrappingStrategy
from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
from .utils import (
@@ -101,7 +101,8 @@ class AsyncWebCrawler:
t1 = time.time()
if user_agent:
self.crawler_strategy.update_user_agent(user_agent)
html = await self.crawler_strategy.crawl(url, **kwargs)
async_response : AsyncCrawlResponse = await self.crawler_strategy.crawl(url, **kwargs)
html = sanitize_input_encode(async_response.html)
t2 = time.time()
if verbose:
print(
@@ -121,8 +122,11 @@ class AsyncWebCrawler:
screenshot_data,
verbose,
bool(cached),
async_response=async_response,
**kwargs,
)
crawl_result.status_code = async_response.status_code
crawl_result.responser_headers = async_response.response_headers
crawl_result.success = bool(html)
crawl_result.session_id = kwargs.get("session_id", None)
return crawl_result

View File

@@ -43,6 +43,10 @@ class WebScrappingStrategy(ContentScrappingStrategy):
image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
for tag in kwargs.get('excluded_tags', []) or []:
for el in body.select(tag):
el.decompose()
if css_selector:
selected_elements = body.select(css_selector)
if not selected_elements:

View File

@@ -17,4 +17,6 @@ class CrawlResult(BaseModel):
extracted_content: Optional[str] = None
metadata: Optional[dict] = None
error_message: Optional[str] = None
session_id: Optional[str] = None
session_id: Optional[str] = None
responser_headers: Optional[dict] = None
status_code: Optional[int] = None

View File

@@ -441,6 +441,10 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
for tag in kwargs.get('excluded_tags', []) or []:
for el in body.select(tag):
el.decompose()
if css_selector:
selected_elements = body.select(css_selector)
if not selected_elements:

View File

@@ -50,7 +50,7 @@
<body>
<header class="bg-gray-900 text-white py-4">
<div class="container mx-auto px-4">
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Scrapper</h1>
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Scraper</h1>
</div>
</header>

View File

@@ -26,7 +26,7 @@ transformer_requirements = [req for req in requirements if req.startswith(("tran
setup(
name="Crawl4AI",
version="0.2.77",
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper",
long_description=open("README.md", encoding="utf-8").read(),
long_description_content_type="text/markdown",
url="https://github.com/unclecode/crawl4ai",