feat(docs): update README for version 0.3.74 with new features and improvements
fix(version): update version number to 0.3.74
refactor(async_webcrawler): enhance logging and add domain-based request delay
README.md | 16
@@ -13,17 +13,15 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
 
 ## New in 0.3.74 ✨
 
 - 🚀 **Blazing Fast Scraping:** The scraping process is now significantly faster, often completing in under 100 milliseconds (excluding web fetch time)!
-- 📥 **Download Mastery:** Control downloads, specify folders, and track files within the `CrawlResult` object.
-- 🔎 **Relevance Filtering:** Extract the most important content with the new `RelevanceContentFilter` and BM25 algorithm. Control filtering with the `fit_markdown` flag.
+- 📥 **Download Manager:** Integrated file crawling and downloading capabilities, with full control over file management and tracking within the `CrawlResult` object.
+- 🔎 **Markdown Filter:** Enhanced content extraction using BM25 algorithm to create cleaner markdown with only relevant webpage content.
 - 🗂️ **Local & Raw HTML:** Crawl local files (`file://`) and raw HTML strings (`raw:`) directly.
-- 🤖 **Browser Boss:** Manage browser sessions with persistent contexts, process monitoring, and tf-playwright-stealth integration. Configure using `use_managed_browser`, `user_data_dir`, and `use_persistent_context` parameters.
+- 🤖 **Browser Control:** Use your own browser setup for crawling, with persistent contexts and stealth integration to bypass anti-bot measures.
 - ☁️ **API & Cache Boost:** CORS support, static file serving, and a new filesystem-based cache for blazing-fast performance. Fine-tune caching with the `CacheMode` enum (ENABLED, DISABLED, READ_ONLY, WRITE_ONLY, BYPASS) and the `always_bypass_cache` parameter.
-- 🔒 **API Security:** Protect your API server with token-based authentication using the `CRAWL4AI_API_TOKEN` environment variable.
-- 🔄 **Synchronous & Direct Crawling:** Get immediate results with `/crawl_sync` or bypass the task queue with `/crawl_direct`.
-- 🛠️ **Database Migration:** A new `crawl4ai-migrate` command ensures smooth upgrades and data integrity between versions.
-- 🐛 **Squashed Bugs:** Fixed browser context issues in Docker, memory leaks, enhanced error handling, and improved HTML parsing.
+- 🐳 **API Gateway:** Run Crawl4AI as a local or cloud API service, enabling cross-platform usage through a containerized server with secure token authentication via `CRAWL4AI_API_TOKEN`.
+- 🛠️ **Database Improvements:** Enhanced database system for handling larger content sets with improved caching and faster performance.
+- 🐛 **Squashed Bugs:** Fixed browser context issues in Docker, memory leaks, enhanced error handling, and improved HTML parsing.
 
 
 ## Try it Now!
 
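For context on the README bullets above, a minimal usage sketch. The call shapes are assumptions pieced together from the bullets and from the code diff below (`AsyncWebCrawler`, `arun`, the `CacheMode` enum, and the `raw:`/`file://` URL prefixes), not verbatim library documentation:

```python
# Hedged sketch of the 0.3.74 features named above.
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode

async def main():
    async with AsyncWebCrawler() as crawler:
        # Filesystem-backed cache, tuned through the CacheMode enum.
        page = await crawler.arun("https://example.com", cache_mode=CacheMode.ENABLED)
        print(page.success)

        # Raw HTML strings and local files crawl directly via URL prefixes.
        raw = await crawler.arun("raw:<html><body><h1>Hello</h1></body></html>")
        local = await crawler.arun("file:///tmp/page.html")
        print(raw.success, local.success)

asyncio.run(main())
```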
@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.3.731"
+__version__ = "0.3.74"
@@ -605,7 +605,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 proxy={"server": self.proxy} if self.proxy else None,
                 java_script_enabled=True,
                 accept_downloads=self.accept_downloads,
-                downloads_path=self.downloads_path if self.accept_downloads else None
+                # downloads_path=self.downloads_path if self.accept_downloads else None
             )
             await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}])
             await context.set_extra_http_headers(self.headers)
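The hunk above comments out `downloads_path` at context creation. In Playwright's Python API, `downloads_path` is a browser launch option while `accept_downloads` is a context option, which is a plausible reason for the removal. A minimal standalone sketch of that split (not the library's actual setup code):

```python
import asyncio
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as p:
        # downloads_path is accepted by launch(), not by new_context().
        browser = await p.chromium.launch(downloads_path="/tmp/crawler_downloads")
        context = await browser.new_context(accept_downloads=True)
        page = await context.new_page()
        await page.goto("https://example.com")
        await browser.close()

asyncio.run(main())
```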
@@ -905,7 +905,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             )
             return response
         except Error as e:
-            raise Error(f"[ERROR] 🚫 crawl(): Failed to crawl {url}: {str(e)}")
+            raise Error(f"async_crawler_strategy.py:_crawleb(): {str(e)}")
         # finally:
         #     if not session_id:
         #         await page.close()
@@ -15,15 +15,19 @@ from .extraction_strategy import *
 from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
 from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
 from .content_scrapping_strategy import WebScrapingStrategy
 
 from .config import (
     MIN_WORD_THRESHOLD,
-    IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
+    IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
+    URL_LOG_SHORTEN_LENGTH
 )
 from .utils import (
     sanitize_input_encode,
     InvalidCSSSelectorError,
     format_html
 )
+from urllib.parse import urlparse
+import random
 from .__version__ import __version__ as crawl4ai_version
 
 
@@ -51,6 +55,7 @@ class AsyncWebCrawler:
     To disable deprecation warnings:
     Pass warning=False to suppress the warning.
     """
+    _domain_last_hit = {}
 
     def __init__(
         self,
@@ -248,7 +253,7 @@ class AsyncWebCrawler:
                 screenshot_data = async_response.screenshot
                 t2 = time.perf_counter()
                 if verbose:
-                    print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s")
+                    print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s")
 
                 # Process the HTML content
                 crawl_result = await self.aprocess_html(
@@ -283,7 +288,7 @@ class AsyncWebCrawler:
             crawl_result.session_id = kwargs.get("session_id", None)
 
             if verbose:
-                print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url} | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}")
+                print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}")
 
 
             # Update cache if appropriate
@@ -295,7 +300,7 @@ class AsyncWebCrawler:
         except Exception as e:
             if not hasattr(e, "msg"):
                 e.msg = str(e)
-            print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url} | {e.msg}{Style.RESET_ALL}")
+            print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}")
             return CrawlResult(
                 url=url,
                 html="",
@@ -350,10 +355,29 @@ class AsyncWebCrawler:
         if cache_mode is None:
             cache_mode = CacheMode.BYPASS
 
-        semaphore_count = kwargs.get('semaphore_count', 5)
+        semaphore_count = kwargs.get('semaphore_count', 10)
         semaphore = asyncio.Semaphore(semaphore_count)
 
         async def crawl_with_semaphore(url):
+            domain = urlparse(url).netloc
+            current_time = time.time()
+
+            print(f"{Fore.LIGHTBLACK_EX}{self.tag_format('PARALLEL')} Started task for {url[:50]}...{Style.RESET_ALL}")
+
+            # Get delay settings from kwargs or use defaults
+            mean_delay = kwargs.get('mean_delay', 0.1)  # 0.1 seconds default mean delay
+            max_range = kwargs.get('max_range', 0.3)  # 0.3 seconds default max additional delay
+
+            # Check if we need to wait
+            if domain in self._domain_last_hit:
+                time_since_last = current_time - self._domain_last_hit[domain]
+                if time_since_last < mean_delay:
+                    delay = mean_delay + random.uniform(0, max_range)
+                    await asyncio.sleep(delay)
+
+            # Update last hit time
+            self._domain_last_hit[domain] = current_time
+
             async with semaphore:
                 return await self.arun(
                     url,
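Taken together, the lines added above implement a simple per-domain politeness delay with random jitter. A condensed, self-contained sketch of the same pattern (helper name hypothetical):

```python
import asyncio
import random
import time
from urllib.parse import urlparse

_domain_last_hit: dict[str, float] = {}

async def wait_for_domain(url: str, mean_delay: float = 0.1, max_range: float = 0.3) -> None:
    """Sleep before re-hitting a domain that was contacted less than mean_delay seconds ago."""
    domain = urlparse(url).netloc
    now = time.time()
    last = _domain_last_hit.get(domain)
    if last is not None and now - last < mean_delay:
        # Base delay plus up to max_range seconds of random jitter.
        await asyncio.sleep(mean_delay + random.uniform(0, max_range))
    _domain_last_hit[domain] = now
```

Note that in the diff `_domain_last_hit` is a class attribute, so the rate-limit state is shared by every `AsyncWebCrawler` instance in the process.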
@@ -369,8 +393,13 @@ class AsyncWebCrawler:
                     **kwargs,
                 )
 
+        # Print start message
+        print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Starting concurrent crawling for {len(urls)} URLs...{Style.RESET_ALL}")
+        start_time = time.perf_counter()
         tasks = [crawl_with_semaphore(url) for url in urls]
         results = await asyncio.gather(*tasks, return_exceptions=True)
+        end_time = time.perf_counter()
+        print(f"{Fore.YELLOW}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} Concurrent crawling completed for {len(urls)} URLs | Total time: {end_time - start_time:.2f}s{Style.RESET_ALL}")
         return [result if not isinstance(result, Exception) else str(result) for result in results]
 
 
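The kwargs read in the hunks above (`semaphore_count`, `mean_delay`, `max_range`) flow in through the batch entry point. A hedged usage sketch, assuming crawl4ai's `arun_many` method as the caller of this code and with illustrative parameter values:

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    urls = ["https://example.com/a", "https://example.org/b"]
    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(
            urls,
            semaphore_count=10,  # max concurrent crawl tasks (new default)
            mean_delay=0.1,      # minimum per-domain gap, seconds
            max_range=0.3,       # extra random jitter, seconds
        )
    # Per the gather() call above, failed tasks come back as exception strings.
    for r in results:
        print(r if isinstance(r, str) else (r.url, r.success))

asyncio.run(main())
```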
@@ -423,7 +452,7 @@ class AsyncWebCrawler:
             metadata = result.get("metadata", {})
 
             if verbose:
-                print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url}{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms")
+                print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms")
 
 
 
@@ -439,7 +468,7 @@ class AsyncWebCrawler:
                 extracted_content = extraction_strategy.run(url, sections)
                 extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)
                 if verbose:
-                    print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url}{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}")
+                    print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}")
 
 
 
@@ -55,4 +55,5 @@ IMAGE_SCORE_THRESHOLD = 2
 MAX_METRICS_HISTORY = 1000
 
 NEED_MIGRATION = True
+URL_LOG_SHORTEN_LENGTH = 30
 SHOW_DEPRECATION_WARNINGS = True
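The new config constant caps URL length in log lines via plain slicing, as used throughout the logging hunks above:

```python
URL_LOG_SHORTEN_LENGTH = 30

url = "https://example.com/some/very/long/path/that/clutters/the/logs"
print(f"Processed {url[:URL_LOG_SHORTEN_LENGTH]}...")
# Processed https://example.com/some/very/...
```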