From 152ac35bc2805610863d1f13efe8434fe2d290bd Mon Sep 17 00:00:00 2001
From: UncleCode
Date: Sun, 17 Nov 2024 21:09:26 +0800
Subject: [PATCH] feat(docs): update README for version 0.3.74 with new
 features and improvements

fix(version): update version number to 0.3.74
refactor(async_webcrawler): enhance logging and add domain-based request delay
---
 README.md                          | 16 +++++------
 crawl4ai/__version__.py            |  2 +-
 crawl4ai/async_crawler_strategy.py |  4 +--
 crawl4ai/async_webcrawler.py       | 43 +++++++++++++++++++++++++-----
 crawl4ai/config.py                 |  1 +
 5 files changed, 47 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index 9c3796cd..f6c8dc08 100644
--- a/README.md
+++ b/README.md
@@ -13,17 +13,15 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
 
 ## New in 0.3.74 ✨
 
-- 🚀 **Blazing Fast Scraping:** The scraping process is now significantly faster, often completing in under 100 milliseconds (excluding web fetch time)!
-- 📥 **Download Mastery:** Control downloads, specify folders, and track files within the `CrawlResult` object.
-- 🔎 **Relevance Filtering:** Extract the most important content with the new `RelevanceContentFilter` and BM25 algorithm. Control filtering with the `fit_markdown` flag.
+- 🚀 **Blazing Fast Scraping:** The scraping process is now significantly faster, often completing in under 100 milliseconds (excluding web fetch time)!
+- 📥 **Download Manager:** Integrated file crawling and downloading capabilities, with full control over file management and tracking within the `CrawlResult` object.
+- 🔎 **Markdown Filter:** Enhanced content extraction using the BM25 algorithm to create cleaner markdown containing only the relevant webpage content.
 - 🗂️ **Local & Raw HTML:** Crawl local files (`file://`) and raw HTML strings (`raw:`) directly.
-- 🤖 **Browser Boss:** Manage browser sessions with persistent contexts, process monitoring, and tf-playwright-stealth integration. Configure using `use_managed_browser`, `user_data_dir`, and `use_persistent_context` parameters.
+- 🤖 **Browser Control:** Use your own browser setup for crawling, with persistent contexts and stealth integration to bypass anti-bot measures.
 - ☁️ **API & Cache Boost:** CORS support, static file serving, and a new filesystem-based cache for blazing-fast performance. Fine-tune caching with the `CacheMode` enum (ENABLED, DISABLED, READ_ONLY, WRITE_ONLY, BYPASS) and the `always_bypass_cache` parameter.
-- 🔒 **API Security:** Protect your API server with token-based authentication using the `CRAWL4AI_API_TOKEN` environment variable.
-- 🔄 **Synchronous & Direct Crawling:** Get immediate results with `/crawl_sync` or bypass the task queue with `/crawl_direct`.
-- 🛠️ **Database Migration:** A new `crawl4ai-migrate` command ensures smooth upgrades and data integrity between versions.
-- 🐛 **Squashed Bugs:** Fixed browser context issues in Docker, memory leaks, enhanced error handling, and improved HTML parsing.
-
+- 🐳 **API Gateway:** Run Crawl4AI as a local or cloud API service, enabling cross-platform usage through a containerized server with secure token authentication via `CRAWL4AI_API_TOKEN`.
+- 🛠️ **Database Improvements:** Enhanced database system for handling larger content sets, with improved caching and faster performance.
+- 🐛 **Squashed Bugs:** Fixed browser context issues in Docker, memory leaks, enhanced error handling, and improved HTML parsing.
 
 ## Try it Now!
 
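For orientation, the README bullets above map onto a few lines of Python. The sketch below is a minimal, unverified example: it assumes `AsyncWebCrawler` and `CacheMode` are importable from the package root (consistent with the `from .cache_context import CacheMode` line in the async_webcrawler.py diff further down) and that `arun()` accepts a `cache_mode` keyword; the `raw:` and `file://` prefixes come straight from the "Local & Raw HTML" bullet.

import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode  # assumption: both re-exported at top level

async def main():
    async with AsyncWebCrawler() as crawler:
        # Ordinary fetch, served through the new filesystem-based cache
        result = await crawler.arun(
            "https://example.com",
            cache_mode=CacheMode.ENABLED,  # assumption: arun() takes cache_mode directly
        )
        print(result.markdown[:200])

        # Raw HTML strings and local files, per the "Local & Raw HTML" bullet
        await crawler.arun("raw:<html><body><h1>Hello</h1></body></html>")
        await crawler.arun("file:///tmp/saved_page.html")

asyncio.run(main())
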
diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py
index 7ab71c9b..65ee6e73 100644
--- a/crawl4ai/__version__.py
+++ b/crawl4ai/__version__.py
@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.3.731"
\ No newline at end of file
+__version__ = "0.3.74"
\ No newline at end of file
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index a67591af..90d5cbe8 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -605,7 +605,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             proxy={"server": self.proxy} if self.proxy else None,
             java_script_enabled=True,
             accept_downloads=self.accept_downloads,
-            downloads_path=self.downloads_path if self.accept_downloads else None
+            # downloads_path=self.downloads_path if self.accept_downloads else None
         )
         await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}])
         await context.set_extra_http_headers(self.headers)
@@ -905,7 +905,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             )
             return response
         except Error as e:
-            raise Error(f"[ERROR] 🚫 crawl(): Failed to crawl {url}: {str(e)}")
+            raise Error(f"async_crawler_strategy.py:_crawleb(): {str(e)}")
         # finally:
         #     if not session_id:
         #         await page.close()
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index d22e3b1f..79a17ac4 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -15,15 +15,19 @@ from .extraction_strategy import *
 from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
 from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
 from .content_scrapping_strategy import WebScrapingStrategy
+
 from .config import (
     MIN_WORD_THRESHOLD,
-    IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
+    IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
+    URL_LOG_SHORTEN_LENGTH
 )
 from .utils import (
     sanitize_input_encode,
     InvalidCSSSelectorError,
     format_html
 )
+from urllib.parse import urlparse
+import random
 
 from .__version__ import __version__ as crawl4ai_version
@@ -51,6 +55,7 @@ class AsyncWebCrawler:
     To disable deprecation warnings:
     Pass warning=False to suppress the warning.
     """
+    _domain_last_hit = {}
 
     def __init__(
         self,
@@ -248,7 +253,7 @@ class AsyncWebCrawler:
                 screenshot_data = async_response.screenshot
                 t2 = time.perf_counter()
                 if verbose:
-                    print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s")
+                    print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s")
 
             # Process the HTML content
             crawl_result = await self.aprocess_html(
@@ -283,7 +288,7 @@ class AsyncWebCrawler:
             crawl_result.session_id = kwargs.get("session_id", None)
 
             if verbose:
-                print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url} | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}")
+                print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}")
 
             # Update cache if appropriate
 
@@ -295,7 +300,7 @@ class AsyncWebCrawler:
         except Exception as e:
             if not hasattr(e, "msg"):
                 e.msg = str(e)
-            print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url} | {e.msg}{Style.RESET_ALL}")
+            print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}")
             return CrawlResult(
                 url=url,
                 html="",
@@ -350,10 +355,29 @@ class AsyncWebCrawler:
         if cache_mode is None:
             cache_mode = CacheMode.BYPASS
 
-        semaphore_count = kwargs.get('semaphore_count', 5)
+        semaphore_count = kwargs.get('semaphore_count', 10)
         semaphore = asyncio.Semaphore(semaphore_count)
 
         async def crawl_with_semaphore(url):
+            domain = urlparse(url).netloc
+            current_time = time.time()
+
+            print(f"{Fore.LIGHTBLACK_EX}{self.tag_format('PARALLEL')} Started task for {url[:50]}...{Style.RESET_ALL}")
+
+            # Get delay settings from kwargs or use defaults
+            mean_delay = kwargs.get('mean_delay', 0.1)  # 0.1 s default mean delay between same-domain hits
+            max_range = kwargs.get('max_range', 0.3)  # up to 0.3 s additional random delay
+
+            # Check if we need to wait
+            if domain in self._domain_last_hit:
+                time_since_last = current_time - self._domain_last_hit[domain]
+                if time_since_last < mean_delay:
+                    delay = mean_delay + random.uniform(0, max_range)
+                    await asyncio.sleep(delay)
+
+            # Update last hit time
+            self._domain_last_hit[domain] = current_time
+
             async with semaphore:
                 return await self.arun(
                     url,
@@ -369,8 +393,13 @@ class AsyncWebCrawler:
                     **kwargs,
                 )
 
+        # Print start message
+        print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Starting concurrent crawling for {len(urls)} URLs...{Style.RESET_ALL}")
+        start_time = time.perf_counter()
         tasks = [crawl_with_semaphore(url) for url in urls]
         results = await asyncio.gather(*tasks, return_exceptions=True)
+        end_time = time.perf_counter()
+        print(f"{Fore.YELLOW}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} Concurrent crawling completed for {len(urls)} URLs | Total time: {end_time - start_time:.2f}s{Style.RESET_ALL}")
         return [result if not isinstance(result, Exception) else str(result) for result in results]
 
@@ -423,7 +452,7 @@ class AsyncWebCrawler:
             metadata = result.get("metadata", {})
 
             if verbose:
-                print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url}{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms")
+                print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms")
 
@@ -439,7 +468,7 @@ class AsyncWebCrawler:
             extracted_content = extraction_strategy.run(url, sections)
             extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)
             if verbose:
-                print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url}{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}")
+                print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}")
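The most interesting piece of the async_webcrawler.py hunk above is the per-domain politeness delay in `crawl_with_semaphore`. Pulled out of the crawler, the same logic reads as follows; `polite_delay` is a hypothetical name for what is an inline block in the actual patch.

import asyncio
import random
import time
from urllib.parse import urlparse

_domain_last_hit = {}  # domain -> timestamp of the most recent request

async def polite_delay(url, mean_delay=0.1, max_range=0.3):
    # If the same domain was hit less than mean_delay seconds ago,
    # sleep for mean_delay plus a random jitter of up to max_range seconds.
    domain = urlparse(url).netloc
    now = time.time()
    last = _domain_last_hit.get(domain)
    if last is not None and now - last < mean_delay:
        await asyncio.sleep(mean_delay + random.uniform(0, max_range))
    _domain_last_hit[domain] = now

Note that, as committed, the timestamp is captured before any sleep and before the semaphore is acquired, so under heavy concurrency the spacing between hits to the same domain is approximate rather than exact.
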
diff --git a/crawl4ai/config.py b/crawl4ai/config.py
index 6b1324dd..786ca4e5 100644
--- a/crawl4ai/config.py
+++ b/crawl4ai/config.py
@@ -55,4 +55,5 @@ IMAGE_SCORE_THRESHOLD = 2
 
 MAX_METRICS_HISTORY = 1000
 NEED_MIGRATION = True
+URL_LOG_SHORTEN_LENGTH = 30
 SHOW_DEPRECATION_WARNINGS = True
\ No newline at end of file
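
Putting the new knobs together, here is a hedged usage sketch for `arun_many()`. The parameter names `semaphore_count`, `mean_delay`, and `max_range` are taken directly from the `kwargs.get` calls in the async_webcrawler.py hunk, with their new defaults of 10, 0.1, and 0.3; the URLs and everything else are illustrative.

import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    urls = [
        "https://example.com/a",
        "https://example.com/b",
        "https://example.org",
    ]
    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(
            urls,
            semaphore_count=10,  # max concurrent crawl tasks (new default in this patch)
            mean_delay=0.1,      # minimum spacing between hits to one domain, seconds
            max_range=0.3,       # random extra jitter added on top, seconds
        )
    for r in results:
        # arun_many() converts exceptions to strings in the returned list,
        # so type-check each element rather than assuming a CrawlResult.
        print(r if isinstance(r, str) else r.url)

asyncio.run(main())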