diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index 3840260e..7b24620c 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -12,6 +12,8 @@ import json, uuid
 import hashlib
 from pathlib import Path
 from playwright.async_api import ProxySettings
+from typing import Optional
+from pydantic import BaseModel
 
 def calculate_semaphore_count():
     cpu_count = os.cpu_count()
@@ -20,13 +22,20 @@ def calculate_semaphore_count():
     memory_based_cap = int(memory_gb / 2) # Assume 2GB per instance
     return min(base_count, memory_based_cap)
 
+class AsyncCrawlResponse(BaseModel):
+    """HTML of a crawled page together with its response headers and status code."""
+    html: str
+    response_headers: Dict[str, str]
+    # None when crawl() never observed a navigation response (it starts out as None there).
+    status_code: Optional[int] = None
+
 class AsyncCrawlerStrategy(ABC):
     @abstractmethod
-    async def crawl(self, url: str, **kwargs) -> str:
+    async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
         pass
 
     @abstractmethod
-    async def crawl_many(self, urls: List[str], **kwargs) -> List[str]:
+    async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
         pass
 
     @abstractmethod
@@ -140,7 +149,16 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         for sid in expired_sessions:
             asyncio.create_task(self.kill_session(sid))
 
-    async def crawl(self, url: str, **kwargs) -> str:
+    async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
+        """Crawl ``url`` and return its HTML together with response metadata.
+
+        ``response_headers`` stays empty and ``status_code`` stays ``None`` when
+        no navigation response is available (``js_only`` crawls, or a cached page
+        without stored metadata).
+        """
+        response_headers = {}
+        status_code = None
+
         self._cleanup_expired_sessions()
         session_id = kwargs.get("session_id")
         if session_id:
@@ -168,13 +186,28 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             if self.use_cached_html:
                 cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest())
                 if os.path.exists(cache_file_path):
                     with open(cache_file_path, "r") as f:
-                        return f.read()
+                        html = f.read()
+                    # Headers and status code live in a sidecar ".meta" file; cache
+                    # entries written without one keep the defaults set above.
+                    meta_file_path = cache_file_path + ".meta"
+                    if os.path.exists(meta_file_path):
+                        with open(meta_file_path, "r", encoding="utf-8") as f:
+                            meta = json.load(f)
+                        response_headers = meta.get("response_headers") or {}
+                        status_code = meta.get("status_code")
+                    return AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code)
 
             if not kwargs.get("js_only", False):
                 await self.execute_hook('before_goto', page)
-                await page.goto(url, wait_until="domcontentloaded", timeout=60000)
+                response = await page.goto(url, wait_until="domcontentloaded", timeout=60000)
                 await self.execute_hook('after_goto', page)
+
+                # goto() can resolve to None instead of a Response, so guard
+                # before reading the status code and headers.
+                if response is not None:
+                    status_code = response.status
+                    response_headers = response.headers
 
             await page.wait_for_selector('body')
             await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
@@ -202,8 +235,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest())
                 with open(cache_file_path, "w", encoding="utf-8") as f:
                     f.write(html)
+                # store response headers and status code in cache
+                with open(cache_file_path + ".meta", "w", encoding="utf-8") as f:
+                    json.dump({
+                        "response_headers": response_headers,
+                        "status_code": status_code
+                    }, f)
 
-            return html
+            response = AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code)
+            return response
         except Error as e:
             raise Error(f"Failed to crawl {url}: {str(e)}")
         finally:
@@ -218,7 +258,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         # except Exception as e:
         #     raise Exception(f"Failed to crawl {url}: {str(e)}")
 
-    async def crawl_many(self, urls: List[str], **kwargs) -> List[str]:
+    async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
         semaphore_count = kwargs.get('semaphore_count', calculate_semaphore_count())
         semaphore = asyncio.Semaphore(semaphore_count)
 
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index 3cdc9ac1..ceca09f5 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -8,7 +8,7 @@ from .models import CrawlResult
 from .async_database import async_db_manager
 from .chunking_strategy import *
 from .extraction_strategy import *
-from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy
+from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
 from .content_scrapping_strategy import WebScrappingStrategy
 from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
 from .utils import (
@@ -101,7 +101,8 @@ class AsyncWebCrawler:
                 t1 = time.time()
                 if user_agent:
                     self.crawler_strategy.update_user_agent(user_agent)
-                html = await self.crawler_strategy.crawl(url, **kwargs)
+                async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, **kwargs)
+                html = sanitize_input_encode(async_response.html)
                 t2 = time.time()
                 if verbose:
                     print(
@@ -121,8 +122,14 @@ class AsyncWebCrawler:
                 screenshot_data,
                 verbose,
                 bool(cached),
+                async_response=async_response,
                 **kwargs,
             )
+            # NOTE(review): async_response is assigned next to the t1/t2 timing code
+            # above; if that code is skipped for cached results, every use of
+            # async_response here raises UnboundLocalError - confirm it is always bound.
+            crawl_result.status_code = async_response.status_code
+            crawl_result.responser_headers = async_response.response_headers
             crawl_result.success = bool(html)
             crawl_result.session_id = kwargs.get("session_id", None)
             return crawl_result
diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py
index 56868354..126dbaed 100644
--- a/crawl4ai/content_scrapping_strategy.py
+++ b/crawl4ai/content_scrapping_strategy.py
@@ -43,6 +43,10 @@ class WebScrappingStrategy(ContentScrappingStrategy):
 
         image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
 
+        for tag in kwargs.get('excluded_tags') or []:
+            for el in body.select(tag):
+                el.decompose()
+
         if css_selector:
             selected_elements = body.select(css_selector)
             if not selected_elements:
diff --git a/crawl4ai/models.py b/crawl4ai/models.py
index e48441b8..eefb0cb9 100644
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -17,4 +17,8 @@ class CrawlResult(BaseModel):
     extracted_content: Optional[str] = None
     metadata: Optional[dict] = None
     error_message: Optional[str] = None
-    session_id: Optional[str] = None
\ No newline at end of file
+    session_id: Optional[str] = None
+    # NOTE(review): "responser_headers" looks like a typo for "response_headers";
+    # kept because AsyncWebCrawler assigns to this exact name - rename both together.
+    responser_headers: Optional[dict] = None
+    status_code: Optional[int] = None
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index 61d1e333..b5736ad8 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -441,6 +441,10 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
 
     image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
 
+    for tag in kwargs.get('excluded_tags') or []:
+        for el in body.select(tag):
+            el.decompose()
+
     if css_selector:
         selected_elements = body.select(css_selector)
         if not selected_elements:
diff --git a/pages/index_pooling.html b/pages/index_pooling.html
index 920801d1..02128f84 100644
--- a/pages/index_pooling.html
+++ b/pages/index_pooling.html
@@ -50,7 +50,7 @@