@@ -12,6 +12,7 @@ import json, uuid
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
from playwright.async_api import ProxySettings
|
||||
from pydantic import BaseModel
|
||||
|
||||
def calculate_semaphore_count():
|
||||
cpu_count = os.cpu_count()
|
||||
@@ -20,13 +21,18 @@ def calculate_semaphore_count():
|
||||
memory_based_cap = int(memory_gb / 2) # Assume 2GB per instance
|
||||
return min(base_count, memory_based_cap)
|
||||
|
||||
class AsyncCrawlResponse(BaseModel):
|
||||
html: str
|
||||
response_headers: Dict[str, str]
|
||||
status_code: int
|
||||
|
||||
class AsyncCrawlerStrategy(ABC):
|
||||
@abstractmethod
|
||||
async def crawl(self, url: str, **kwargs) -> str:
|
||||
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def crawl_many(self, urls: List[str], **kwargs) -> List[str]:
|
||||
async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
@@ -140,7 +146,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
for sid in expired_sessions:
|
||||
asyncio.create_task(self.kill_session(sid))
|
||||
|
||||
async def crawl(self, url: str, **kwargs) -> str:
|
||||
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
|
||||
response_headers = {}
|
||||
status_code = None
|
||||
|
||||
self._cleanup_expired_sessions()
|
||||
session_id = kwargs.get("session_id")
|
||||
if session_id:
|
||||
@@ -168,13 +177,25 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
if self.use_cached_html:
|
||||
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest())
|
||||
if os.path.exists(cache_file_path):
|
||||
html = ""
|
||||
with open(cache_file_path, "r") as f:
|
||||
return f.read()
|
||||
html = f.read()
|
||||
# retrieve response headers and status code from cache
|
||||
with open(cache_file_path + ".meta", "r") as f:
|
||||
meta = json.load(f)
|
||||
response_headers = meta.get("response_headers", {})
|
||||
status_code = meta.get("status_code")
|
||||
response = AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code)
|
||||
return response
|
||||
|
||||
if not kwargs.get("js_only", False):
|
||||
await self.execute_hook('before_goto', page)
|
||||
await page.goto(url, wait_until="domcontentloaded", timeout=60000)
|
||||
response = await page.goto(url, wait_until="domcontentloaded", timeout=60000)
|
||||
await self.execute_hook('after_goto', page)
|
||||
|
||||
# Get status code and headers
|
||||
status_code = response.status
|
||||
response_headers = response.headers
|
||||
|
||||
await page.wait_for_selector('body')
|
||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||
@@ -202,8 +223,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest())
|
||||
with open(cache_file_path, "w", encoding="utf-8") as f:
|
||||
f.write(html)
|
||||
# store response headers and status code in cache
|
||||
with open(cache_file_path + ".meta", "w", encoding="utf-8") as f:
|
||||
json.dump({
|
||||
"response_headers": response_headers,
|
||||
"status_code": status_code
|
||||
}, f)
|
||||
|
||||
return html
|
||||
response = AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code)
|
||||
return response
|
||||
except Error as e:
|
||||
raise Error(f"Failed to crawl {url}: {str(e)}")
|
||||
finally:
|
||||
@@ -218,7 +246,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
# except Exception as e:
|
||||
# raise Exception(f"Failed to crawl {url}: {str(e)}")
|
||||
|
||||
async def crawl_many(self, urls: List[str], **kwargs) -> List[str]:
|
||||
async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
|
||||
semaphore_count = kwargs.get('semaphore_count', calculate_semaphore_count())
|
||||
semaphore = asyncio.Semaphore(semaphore_count)
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ from .models import CrawlResult
|
||||
from .async_database import async_db_manager
|
||||
from .chunking_strategy import *
|
||||
from .extraction_strategy import *
|
||||
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy
|
||||
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
|
||||
from .content_scrapping_strategy import WebScrappingStrategy
|
||||
from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
|
||||
from .utils import (
|
||||
@@ -101,7 +101,8 @@ class AsyncWebCrawler:
|
||||
t1 = time.time()
|
||||
if user_agent:
|
||||
self.crawler_strategy.update_user_agent(user_agent)
|
||||
html = await self.crawler_strategy.crawl(url, **kwargs)
|
||||
async_response : AsyncCrawlResponse = await self.crawler_strategy.crawl(url, **kwargs)
|
||||
html = sanitize_input_encode(async_response.html)
|
||||
t2 = time.time()
|
||||
if verbose:
|
||||
print(
|
||||
@@ -121,8 +122,11 @@ class AsyncWebCrawler:
|
||||
screenshot_data,
|
||||
verbose,
|
||||
bool(cached),
|
||||
async_response=async_response,
|
||||
**kwargs,
|
||||
)
|
||||
crawl_result.status_code = async_response.status_code
|
||||
crawl_result.responser_headers = async_response.response_headers
|
||||
crawl_result.success = bool(html)
|
||||
crawl_result.session_id = kwargs.get("session_id", None)
|
||||
return crawl_result
|
||||
|
||||
@@ -43,6 +43,10 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
||||
|
||||
image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
|
||||
|
||||
for tag in kwargs.get('excluded_tags', []) or []:
|
||||
for el in body.select(tag):
|
||||
el.decompose()
|
||||
|
||||
if css_selector:
|
||||
selected_elements = body.select(css_selector)
|
||||
if not selected_elements:
|
||||
|
||||
@@ -17,4 +17,6 @@ class CrawlResult(BaseModel):
|
||||
extracted_content: Optional[str] = None
|
||||
metadata: Optional[dict] = None
|
||||
error_message: Optional[str] = None
|
||||
session_id: Optional[str] = None
|
||||
session_id: Optional[str] = None
|
||||
responser_headers: Optional[dict] = None
|
||||
status_code: Optional[int] = None
|
||||
@@ -441,6 +441,10 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
||||
|
||||
image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
|
||||
|
||||
for tag in kwargs.get('excluded_tags', []) or []:
|
||||
for el in body.select(tag):
|
||||
el.decompose()
|
||||
|
||||
if css_selector:
|
||||
selected_elements = body.select(css_selector)
|
||||
if not selected_elements:
|
||||
|
||||
@@ -50,7 +50,7 @@
|
||||
<body>
|
||||
<header class="bg-gray-900 text-white py-4">
|
||||
<div class="container mx-auto px-4">
|
||||
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Scrapper</h1>
|
||||
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Open-source LLM Friendly Web scraper</h1>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
|
||||
2
setup.py
2
setup.py
@@ -26,7 +26,7 @@ transformer_requirements = [req for req in requirements if req.startswith(("tran
|
||||
setup(
|
||||
name="Crawl4AI",
|
||||
version="0.2.77",
|
||||
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
|
||||
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper",
|
||||
long_description=open("README.md", encoding="utf-8").read(),
|
||||
long_description_content_type="text/markdown",
|
||||
url="https://github.com/unclecode/crawl4ai",
|
||||
|
||||
Reference in New Issue
Block a user