From 9247877037395dbf9b2fca67241a134724ec0155 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 20 Jan 2025 22:14:05 +0800 Subject: [PATCH 01/19] feat(proxy): add proxy configuration support to CrawlerRunConfig Add proxy_config parameter to CrawlerRunConfig to support dynamic proxy configuration per crawl request. This enables users to specify different proxy settings for each crawl operation without modifying the browser config. - Added proxy_config parameter to CrawlerRunConfig - Updated BrowserManager to apply proxy settings from CrawlerRunConfig - Updated proxy-security documentation with new usage examples --- CHANGELOG.md | 4 ++++ crawl4ai/async_configs.py | 6 ++++++ crawl4ai/async_crawler_strategy.py | 16 +++++++++++--- docs/md_v2/advanced/proxy-security.md | 30 ++++++++++++++++++--------- 4 files changed, 43 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3bea14df..93bd9bdc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +### [Added] 2025-01-20 +- Added proxy configuration support to CrawlerRunConfig allowing dynamic proxy settings per crawl request +- Updated documentation with examples for using proxy configuration in crawl operations + ### [Added] 2025-01-20 - New LLM-powered schema generation utility for JsonElementExtractionStrategy - Support for automatic CSS and XPath schema generation using OpenAI or Ollama diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index f4914726..fbcb6e70 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -270,6 +270,8 @@ class CrawlerRunConfig: Default: "lxml". scraping_strategy (ContentScrapingStrategy): Scraping strategy to use. Default: WebScrapingStrategy. + proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}. + If None, no additional proxy config. Default: None. # Caching Parameters cache_mode (CacheMode or None): Defines how caching is handled. 
@@ -389,6 +391,7 @@ class CrawlerRunConfig: prettiify: bool = False, parser_type: str = "lxml", scraping_strategy: ContentScrapingStrategy = None, + proxy_config: dict = None, # SSL Parameters fetch_ssl_certificate: bool = False, # Caching Parameters @@ -457,6 +460,7 @@ class CrawlerRunConfig: self.prettiify = prettiify self.parser_type = parser_type self.scraping_strategy = scraping_strategy or WebScrapingStrategy() + self.proxy_config = proxy_config # SSL Parameters self.fetch_ssl_certificate = fetch_ssl_certificate @@ -553,6 +557,7 @@ class CrawlerRunConfig: prettiify=kwargs.get("prettiify", False), parser_type=kwargs.get("parser_type", "lxml"), scraping_strategy=kwargs.get("scraping_strategy"), + proxy_config=kwargs.get("proxy_config"), # SSL Parameters fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False), # Caching Parameters @@ -631,6 +636,7 @@ class CrawlerRunConfig: "prettiify": self.prettiify, "parser_type": self.parser_type, "scraping_strategy": self.scraping_strategy, + "proxy_config": self.proxy_config, "fetch_ssl_certificate": self.fetch_ssl_certificate, "cache_mode": self.cache_mode, "session_id": self.session_id, diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 786d2fb9..ae1788f1 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -543,9 +543,9 @@ class BrowserManager: or crawlerRunConfig.simulate_user or crawlerRunConfig.magic ): - await context.add_init_script(load_js_script("navigator_overrider")) + await context.add_init_script(load_js_script("navigator_overrider")) - async def create_browser_context(self): + async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None): """ Creates and returns a new browser context with configured settings. Applies text-only mode settings if text_mode is enabled in config. 
@@ -627,6 +627,16 @@ class BrowserManager: "device_scale_factor": 1.0, "java_script_enabled": self.config.java_script_enabled, } + + if crawlerRunConfig: + # Check if there is value for crawlerRunConfig.proxy_config set add that to context + if crawlerRunConfig.proxy_config: + proxy_settings = { + "server": crawlerRunConfig.proxy_config.get("server"), + "username": crawlerRunConfig.proxy_config.get("username"), + "password": crawlerRunConfig.proxy_config.get("password"), + } + context_settings["proxy"] = proxy_settings if self.config.text_mode: text_mode_settings = { @@ -710,7 +720,7 @@ class BrowserManager: context = self.contexts_by_config[config_signature] else: # Create and setup a new context - context = await self.create_browser_context() + context = await self.create_browser_context(crawlerRunConfig) await self.setup_context(context, crawlerRunConfig) self.contexts_by_config[config_signature] = context diff --git a/docs/md_v2/advanced/proxy-security.md b/docs/md_v2/advanced/proxy-security.md index b98c17e5..9b64fd84 100644 --- a/docs/md_v2/advanced/proxy-security.md +++ b/docs/md_v2/advanced/proxy-security.md @@ -36,23 +36,33 @@ async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun(url="https://example.com") ``` -## Rotating Proxies +Here's the corrected documentation: -Example using a proxy rotation service and updating `BrowserConfig` dynamically: +## Rotating Proxies [COMING SOON] + +Example using a proxy rotation service dynamically: ```python -from crawl4ai.async_configs import BrowserConfig +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig async def get_next_proxy(): # Your proxy rotation logic here return {"server": "http://next.proxy.com:8080"} -browser_config = BrowserConfig() -async with AsyncWebCrawler(config=browser_config) as crawler: - # Update proxy for each request - for url in urls: - proxy = await get_next_proxy() - browser_config.proxy_config = proxy - result = await 
crawler.arun(url=url, config=browser_config) +async def main(): + browser_config = BrowserConfig() + run_config = CrawlerRunConfig() + + async with AsyncWebCrawler(config=browser_config) as crawler: + # For each URL, create a new run config with different proxy + for url in urls: + proxy = await get_next_proxy() + # Clone the config and update proxy - this creates a new browser context + current_config = run_config.clone(proxy_config=proxy) + result = await crawler.arun(url=url, config=current_config) + +if __name__ == "__main__": + import asyncio + asyncio.run(main()) ``` From d09c611d152717bc0801b65cc45efeebff2e4399 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 21 Jan 2025 17:54:13 +0800 Subject: [PATCH 02/19] feat(robots): add robots.txt compliance support Add support for checking and respecting robots.txt rules before crawling websites: - Implement RobotsParser class with SQLite caching - Add check_robots_txt parameter to CrawlerRunConfig - Integrate robots.txt checking in AsyncWebCrawler - Update documentation with robots.txt compliance examples - Add tests for robot parser functionality The cache uses WAL mode for better concurrency and has a default TTL of 7 days. 
--- .gitignore | 5 +- CHANGELOG.md | 6 + crawl4ai/async_configs.py | 7 + crawl4ai/async_webcrawler.py | 16 +++ crawl4ai/utils.py | 132 ++++++++++++++++++ docs/md_v2/advanced/advanced-features.md | 38 ++++++ docs/md_v2/advanced/multi-url-crawling.md | 38 ++++++ docs/md_v2/api/arun.md | 8 +- docs/md_v2/api/parameters.md | 23 ++-- tests/20241401/test_robot_parser.py | 159 ++++++++++++++++++++++ tests/20241401/tets_robot.py | 62 +++++++++ 11 files changed, 482 insertions(+), 12 deletions(-) create mode 100644 tests/20241401/test_robot_parser.py create mode 100644 tests/20241401/tets_robot.py diff --git a/.gitignore b/.gitignore index 4f469aa6..302892e4 100644 --- a/.gitignore +++ b/.gitignore @@ -227,4 +227,7 @@ tree.md .do /plans .codeiumignore -todo/ \ No newline at end of file +todo/ + +# windsurf rules +.windsurfrules diff --git a/CHANGELOG.md b/CHANGELOG.md index 93bd9bdc..6c790f02 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +### [Added] 2025-01-21 +- Added robots.txt compliance support with efficient SQLite-based caching +- New `check_robots_txt` parameter in CrawlerRunConfig to enable robots.txt checking +- Documentation updates for robots.txt compliance features and examples +- Automated robots.txt checking integrated into AsyncWebCrawler with 403 status codes for blocked URLs + ### [Added] 2025-01-20 - Added proxy configuration support to CrawlerRunConfig allowing dynamic proxy settings per crawl request - Updated documentation with examples for using proxy configuration in crawl operations diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index fbcb6e70..b0813abe 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -372,6 +372,7 @@ class CrawlerRunConfig: # Optional Parameters stream (bool): If True, stream the page content as it is being loaded. url: str = None # This is not a compulsory parameter + check_robots_txt (bool): Whether to check robots.txt rules before crawling. 
Default: False """ def __init__( @@ -442,6 +443,7 @@ class CrawlerRunConfig: # Streaming Parameters stream: bool = False, url: str = None, + check_robots_txt: bool = False, ): self.url = url @@ -521,6 +523,9 @@ class CrawlerRunConfig: # Streaming Parameters self.stream = stream + # Robots.txt Handling Parameters + self.check_robots_txt = check_robots_txt + # Validate type of extraction strategy and chunking strategy if they are provided if self.extraction_strategy is not None and not isinstance( self.extraction_strategy, ExtractionStrategy @@ -617,6 +622,7 @@ class CrawlerRunConfig: # Streaming Parameters stream=kwargs.get("stream", False), url=kwargs.get("url"), + check_robots_txt=kwargs.get("check_robots_txt", False), ) # Create a funciton returns dict of the object @@ -679,6 +685,7 @@ class CrawlerRunConfig: "log_console": self.log_console, "stream": self.stream, "url": self.url, + "check_robots_txt": self.check_robots_txt, } def clone(self, **kwargs): diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 61cfc18f..dc7e2cb9 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -40,6 +40,7 @@ from .utils import ( fast_format_html, create_box_message, get_error_context, + RobotsParser, ) from typing import Union, AsyncGenerator, List, TypeVar @@ -203,6 +204,9 @@ class AsyncWebCrawler: os.makedirs(self.crawl4ai_folder, exist_ok=True) os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) + # Initialize robots parser + self.robots_parser = RobotsParser() + self.ready = False async def start(self): @@ -414,6 +418,18 @@ class AsyncWebCrawler: if user_agent: self.crawler_strategy.update_user_agent(user_agent) + # Check robots.txt if enabled + if config and config.check_robots_txt: + if not await self.robots_parser.can_fetch(url, self.browser_config.user_agent): + return CrawlResult( + url=url, + html="", + success=False, + status_code=403, + error_message="Access denied by robots.txt", + 
response_headers={"X-Robots-Status": "Blocked by robots.txt"} + ) + # Pass config to crawl method async_response = await self.crawler_strategy.crawl( url, diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index ea1309a8..2e9e3ff8 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -23,6 +23,138 @@ import pstats from functools import wraps import asyncio +import sqlite3 +import hashlib +from urllib.parse import urljoin, urlparse +from urllib.robotparser import RobotFileParser +import aiohttp + +class RobotsParser: + # Default 7 days cache TTL + CACHE_TTL = 7 * 24 * 60 * 60 + + def __init__(self, cache_dir=None, cache_ttl=None): + self.cache_dir = cache_dir or os.path.join(get_home_folder(), ".crawl4ai", "robots") + self.cache_ttl = cache_ttl or self.CACHE_TTL + os.makedirs(self.cache_dir, exist_ok=True) + self.db_path = os.path.join(self.cache_dir, "robots_cache.db") + self._init_db() + + def _init_db(self): + # Use WAL mode for better concurrency and performance + with sqlite3.connect(self.db_path) as conn: + conn.execute("PRAGMA journal_mode=WAL") + conn.execute(""" + CREATE TABLE IF NOT EXISTS robots_cache ( + domain TEXT PRIMARY KEY, + rules TEXT NOT NULL, + fetch_time INTEGER NOT NULL, + hash TEXT NOT NULL + ) + """) + conn.execute("CREATE INDEX IF NOT EXISTS idx_domain ON robots_cache(domain)") + + def _get_cached_rules(self, domain: str) -> tuple[str, bool]: + """Get cached rules. 
Returns (rules, is_fresh)""" + with sqlite3.connect(self.db_path) as conn: + cursor = conn.execute( + "SELECT rules, fetch_time, hash FROM robots_cache WHERE domain = ?", + (domain,) + ) + result = cursor.fetchone() + + if not result: + return None, False + + rules, fetch_time, _ = result + # Check if cache is still fresh based on TTL + return rules, (time.time() - fetch_time) < self.cache_ttl + + def _cache_rules(self, domain: str, content: str): + """Cache robots.txt content with hash for change detection""" + hash_val = hashlib.md5(content.encode()).hexdigest() + with sqlite3.connect(self.db_path) as conn: + # Check if content actually changed + cursor = conn.execute( + "SELECT hash FROM robots_cache WHERE domain = ?", + (domain,) + ) + result = cursor.fetchone() + + # Only update if hash changed or no previous entry + if not result or result[0] != hash_val: + conn.execute( + """INSERT OR REPLACE INTO robots_cache + (domain, rules, fetch_time, hash) + VALUES (?, ?, ?, ?)""", + (domain, content, int(time.time()), hash_val) + ) + + async def can_fetch(self, url: str, user_agent: str = "*") -> bool: + """ + Check if URL can be fetched according to robots.txt rules. 
+ + Args: + url: The URL to check + user_agent: User agent string to check against (default: "*") + + Returns: + bool: True if allowed, False if disallowed by robots.txt + """ + # Handle empty/invalid URLs + try: + parsed = urlparse(url) + domain = parsed.netloc + if not domain: + return True + except: + return True + + # Fast path - check cache first + rules, is_fresh = self._get_cached_rules(domain) + + # If rules not found or stale, fetch new ones + if not is_fresh: + try: + # Ensure we use the same scheme as the input URL + scheme = parsed.scheme or 'http' + robots_url = f"{scheme}://{domain}/robots.txt" + + async with aiohttp.ClientSession() as session: + async with session.get(robots_url, timeout=2) as response: + if response.status == 200: + rules = await response.text() + self._cache_rules(domain, rules) + else: + return True + except: + # On any error (timeout, connection failed, etc), allow access + return True + + if not rules: + return True + + # Create parser for this check + parser = RobotFileParser() + parser.parse(rules.splitlines()) + + # If parser can't read rules, allow access + if not parser.mtime(): + return True + + return parser.can_fetch(user_agent, url) + + def clear_cache(self): + """Clear all cached robots.txt entries""" + with sqlite3.connect(self.db_path) as conn: + conn.execute("DELETE FROM robots_cache") + + def clear_expired(self): + """Remove only expired entries from cache""" + with sqlite3.connect(self.db_path) as conn: + expire_time = int(time.time()) - self.cache_ttl + conn.execute("DELETE FROM robots_cache WHERE fetch_time < ?", (expire_time,)) + class InvalidCSSSelectorError(Exception): pass diff --git a/docs/md_v2/advanced/advanced-features.md b/docs/md_v2/advanced/advanced-features.md index 1f402948..6b3776d1 100644 --- a/docs/md_v2/advanced/advanced-features.md +++ b/docs/md_v2/advanced/advanced-features.md @@ -8,6 +8,7 @@ Crawl4AI offers multiple power-user features that go beyond simple crawling. Thi 3. 
**Handling SSL Certificates** 4. **Custom Headers** 5. **Session Persistence & Local Storage** +6. **Robots.txt Compliance** > **Prerequisites** > - You have a basic grasp of [AsyncWebCrawler Basics](../core/simple-crawling.md) @@ -251,6 +252,42 @@ You can sign in once, export the browser context, and reuse it later—without r --- +## 6. Robots.txt Compliance + +Crawl4AI supports respecting robots.txt rules with efficient caching: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig + +async def main(): + # Enable robots.txt checking in config + config = CrawlerRunConfig( + check_robots_txt=True # Will check and respect robots.txt rules + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + "https://example.com", + config=config + ) + + if not result.success and result.status_code == 403: + print("Access denied by robots.txt") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Key Points** +- Robots.txt files are cached locally for efficiency +- Cache is stored in `~/.crawl4ai/robots/robots_cache.db` +- Cache has a default TTL of 7 days +- If robots.txt can't be fetched, crawling is allowed +- Returns 403 status code if URL is disallowed + +--- + ## Putting It All Together Here’s a snippet that combines multiple “advanced” features (proxy, PDF, screenshot, SSL, custom headers, and session reuse) into one run. Normally, you’d tailor each setting to your project’s needs. @@ -321,6 +358,7 @@ You’ve now explored several **advanced** features: - **SSL Certificate** retrieval & exporting - **Custom Headers** for language or specialized requests - **Session Persistence** via storage state +- **Robots.txt Compliance** With these power tools, you can build robust scraping workflows that mimic real user behavior, handle secure sites, capture detailed snapshots, and manage sessions across multiple runs—streamlining your entire data collection pipeline. 
diff --git a/docs/md_v2/advanced/multi-url-crawling.md b/docs/md_v2/advanced/multi-url-crawling.md index cae789a2..d9b04535 100644 --- a/docs/md_v2/advanced/multi-url-crawling.md +++ b/docs/md_v2/advanced/multi-url-crawling.md @@ -189,6 +189,44 @@ async def crawl_with_semaphore(urls): return results ``` +### 4.4 Robots.txt Consideration + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode + +async def main(): + urls = [ + "https://example1.com", + "https://example2.com", + "https://example3.com" + ] + + config = CrawlerRunConfig( + cache_mode=CacheMode.ENABLED, + check_robots_txt=True, # Will respect robots.txt for each URL + semaphore_count=3 # Max concurrent requests + ) + + async with AsyncWebCrawler() as crawler: + async for result in crawler.arun_many(urls, config=config): + if result.success: + print(f"Successfully crawled {result.url}") + elif result.status_code == 403 and "robots.txt" in result.error_message: + print(f"Skipped {result.url} - blocked by robots.txt") + else: + print(f"Failed to crawl {result.url}: {result.error_message}") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Key Points**: +- When `check_robots_txt=True`, each URL's robots.txt is checked before crawling +- Robots.txt files are cached for efficiency +- Failed robots.txt checks return 403 status code +- Dispatcher handles robots.txt checks automatically for each URL + ## 5. Dispatch Results Each crawl result includes dispatch information: diff --git a/docs/md_v2/api/arun.md b/docs/md_v2/api/arun.md index 5972f402..b951b9a5 100644 --- a/docs/md_v2/api/arun.md +++ b/docs/md_v2/api/arun.md @@ -22,6 +22,7 @@ async def main(): run_config = CrawlerRunConfig( verbose=True, # Detailed logging cache_mode=CacheMode.ENABLED, # Use normal read/write cache + check_robots_txt=True, # Respect robots.txt rules # ... 
other parameters ) @@ -30,8 +31,10 @@ async def main(): url="https://example.com", config=run_config ) - print(result.cleaned_html[:500]) - + + # Check if blocked by robots.txt + if not result.success and result.status_code == 403: + print(f"Error: {result.error_message}") ``` **Key Fields**: @@ -226,6 +229,7 @@ async def main(): # Core verbose=True, cache_mode=CacheMode.ENABLED, + check_robots_txt=True, # Respect robots.txt rules # Content word_count_threshold=10, diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md index 3ff5bb53..932a2642 100644 --- a/docs/md_v2/api/parameters.md +++ b/docs/md_v2/api/parameters.md @@ -106,6 +106,7 @@ Use these for controlling whether you read or write from a local content cache. | **`wait_for`** | `str or None` | Wait for a CSS (`"css:selector"`) or JS (`"js:() => bool"`) condition before content extraction. | | **`wait_for_images`** | `bool` (False) | Wait for images to load before finishing. Slows down if you only want text. | | **`delay_before_return_html`** | `float` (0.1) | Additional pause (seconds) before final HTML is captured. Good for last-second updates. | +| **`check_robots_txt`** | `bool` (False) | Whether to check and respect robots.txt rules before crawling. If True, caches robots.txt for efficiency. | | **`mean_delay`** and **`max_range`** | `float` (0.1, 0.3) | If you call `arun_many()`, these define random delay intervals between crawls, helping avoid detection or rate limits. | | **`semaphore_count`** | `int` (5) | Max concurrency for `arun_many()`. Increase if you have resources for parallel crawls. 
| @@ -266,17 +267,21 @@ async def main(): if __name__ == "__main__": asyncio.run(main()) + +## 2.4 Compliance & Ethics + +| **Parameter** | **Type / Default** | **What It Does** | +|-----------------------|-------------------------|----------------------------------------------------------------------------------------------------------------------| +| **`check_robots_txt`**| `bool` (False) | When True, checks and respects robots.txt rules before crawling. Uses efficient caching with SQLite backend. | +| **`user_agent`** | `str` (None) | User agent string to identify your crawler. Used for robots.txt checking when enabled. | + +```python +run_config = CrawlerRunConfig( + check_robots_txt=True, # Enable robots.txt compliance + user_agent="MyBot/1.0" # Identify your crawler +) ``` -**What’s Happening**: -- **`text_mode=True`** avoids loading images and other heavy resources, speeding up the crawl. -- We disable caching (`cache_mode=CacheMode.BYPASS`) to always fetch fresh content. -- We only keep `main.article` content by specifying `css_selector="main.article"`. -- We exclude external links (`exclude_external_links=True`). -- We do a quick screenshot (`screenshot=True`) before finishing. - ---- - ## 3. Putting It All Together - **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent. diff --git a/tests/20241401/test_robot_parser.py b/tests/20241401/test_robot_parser.py new file mode 100644 index 00000000..a2fc30f1 --- /dev/null +++ b/tests/20241401/test_robot_parser.py @@ -0,0 +1,159 @@ +from crawl4ai.utils import RobotsParser + +import asyncio +import aiohttp +from aiohttp import web +import tempfile +import shutil +import os, sys, time, json + + +async def test_robots_parser(): + print("\n=== Testing RobotsParser ===\n") + + # Setup temporary directory for testing + temp_dir = tempfile.mkdtemp() + try: + # 1. Basic setup test + print("1. 
Testing basic initialization...") + parser = RobotsParser(cache_dir=temp_dir) + assert os.path.exists(parser.db_path), "Database file not created" + print("✓ Basic initialization passed") + + # 2. Test common cases + print("\n2. Testing common cases...") + allowed = await parser.can_fetch("https://www.example.com", "MyBot/1.0") + print(f"✓ Regular website fetch: {'allowed' if allowed else 'denied'}") + + # Test caching + print("Testing cache...") + start = time.time() + await parser.can_fetch("https://www.example.com", "MyBot/1.0") + duration = time.time() - start + print(f"✓ Cached lookup took: {duration*1000:.2f}ms") + assert duration < 0.03, "Cache lookup too slow" + + # 3. Edge cases + print("\n3. Testing edge cases...") + + # Empty URL + result = await parser.can_fetch("", "MyBot/1.0") + print(f"✓ Empty URL handled: {'allowed' if result else 'denied'}") + + # Invalid URL + result = await parser.can_fetch("not_a_url", "MyBot/1.0") + print(f"✓ Invalid URL handled: {'allowed' if result else 'denied'}") + + # URL without scheme + result = await parser.can_fetch("example.com/page", "MyBot/1.0") + print(f"✓ URL without scheme handled: {'allowed' if result else 'denied'}") + + # 4. 
Test with local server + async def start_test_server(): + app = web.Application() + + async def robots_txt(request): + return web.Response(text="""User-agent: * +Disallow: /private/ +Allow: /public/ +""") + + async def malformed_robots(request): + return web.Response(text="<<>>") + + async def timeout_robots(request): + await asyncio.sleep(5) + return web.Response(text="Should timeout") + + async def empty_robots(request): + return web.Response(text="") + + async def giant_robots(request): + return web.Response(text="User-agent: *\nDisallow: /\n" * 10000) + + # Mount all handlers at root level + app.router.add_get('/robots.txt', robots_txt) + app.router.add_get('/malformed/robots.txt', malformed_robots) + app.router.add_get('/timeout/robots.txt', timeout_robots) + app.router.add_get('/empty/robots.txt', empty_robots) + app.router.add_get('/giant/robots.txt', giant_robots) + + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, 'localhost', 8080) + await site.start() + return runner + + runner = await start_test_server() + try: + print("\n4. 
Testing robots.txt rules...") + base_url = "http://localhost:8080" + + # Test public access + result = await parser.can_fetch(f"{base_url}/public/page", "bot") + print(f"Public access (/public/page): {'allowed' if result else 'denied'}") + assert result, "Public path should be allowed" + + # Test private access + result = await parser.can_fetch(f"{base_url}/private/secret", "bot") + print(f"Private access (/private/secret): {'allowed' if result else 'denied'}") + assert not result, "Private path should be denied" + + # Test malformed + result = await parser.can_fetch("http://localhost:8080/malformed/page", "bot") + print(f"✓ Malformed robots.txt handled: {'allowed' if result else 'denied'}") + + # Test timeout + start = time.time() + result = await parser.can_fetch("http://localhost:8080/timeout/page", "bot") + duration = time.time() - start + print(f"✓ Timeout handled (took {duration:.2f}s): {'allowed' if result else 'denied'}") + assert duration < 3, "Timeout not working" + + # Test empty + result = await parser.can_fetch("http://localhost:8080/empty/page", "bot") + print(f"✓ Empty robots.txt handled: {'allowed' if result else 'denied'}") + + # Test giant file + start = time.time() + result = await parser.can_fetch("http://localhost:8080/giant/page", "bot") + duration = time.time() - start + print(f"✓ Giant robots.txt handled (took {duration:.2f}s): {'allowed' if result else 'denied'}") + + finally: + await runner.cleanup() + + # 5. Cache manipulation + print("\n5. 
Testing cache manipulation...") + + # Clear expired + parser.clear_expired() + print("✓ Clear expired entries completed") + + # Clear all + parser.clear_cache() + print("✓ Clear all cache completed") + + # Test with custom TTL + custom_parser = RobotsParser(cache_dir=temp_dir, cache_ttl=1) # 1 second TTL + await custom_parser.can_fetch("https://www.example.com", "bot") + print("✓ Custom TTL fetch completed") + await asyncio.sleep(1.1) + start = time.time() + await custom_parser.can_fetch("https://www.example.com", "bot") + print(f"✓ TTL expiry working (refetched after {time.time() - start:.2f}s)") + + finally: + # Cleanup + shutil.rmtree(temp_dir) + print("\nTest cleanup completed") + +async def main(): + try: + await test_robots_parser() + except Exception as e: + print(f"Test failed: {str(e)}") + raise + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/tests/20241401/tets_robot.py b/tests/20241401/tets_robot.py new file mode 100644 index 00000000..9bb30bb9 --- /dev/null +++ b/tests/20241401/tets_robot.py @@ -0,0 +1,62 @@ +import asyncio +from crawl4ai import * + +async def test_real_websites(): + print("\n=== Testing Real Website Robots.txt Compliance ===\n") + + browser_config = BrowserConfig(headless=True, verbose=True) + async with AsyncWebCrawler(config=browser_config) as crawler: + + # Test cases with URLs + test_cases = [ + # Public sites that should be allowed + ("https://example.com", True), # Simple public site + ("https://httpbin.org/get", True), # API endpoint + + # Sites with known strict robots.txt + ("https://www.facebook.com/robots.txt", False), # Social media + ("https://www.google.com/search", False), # Search pages + + # Edge cases + ("https://api.github.com", True), # API service + ("https://raw.githubusercontent.com", True), # Content delivery + + # Non-existent/error cases + ("https://thisisnotarealwebsite.com", True), # Non-existent domain + ("https://localhost:12345", True), # Invalid port + ] + + 
for url, expected in test_cases: + print(f"\nTesting: {url}") + try: + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + check_robots_txt=True, # Enable robots.txt checking + verbose=True + ) + + result = await crawler.arun(url=url, config=config) + allowed = result.success and not result.error_message + + print(f"Expected: {'allowed' if expected else 'denied'}") + print(f"Actual: {'allowed' if allowed else 'denied'}") + print(f"Status Code: {result.status_code}") + if result.error_message: + print(f"Error: {result.error_message}") + + # Optional: Print robots.txt content if available + if result.metadata and 'robots_txt' in result.metadata: + print(f"Robots.txt rules:\n{result.metadata['robots_txt']}") + + except Exception as e: + print(f"Test failed with error: {str(e)}") + +async def main(): + try: + await test_real_websites() + except Exception as e: + print(f"Test suite failed: {str(e)}") + raise + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file From 16b8d4945b831dc06130bd2b1c698f33c4c31d01 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 21 Jan 2025 21:03:11 +0800 Subject: [PATCH 03/19] feat(release): prepare v0.4.3 beta release Prepare the v0.4.3 beta release with major feature additions and improvements: - Add JsonXPathExtractionStrategy and LLMContentFilter to exports - Update version to 0.4.3b1 - Improve documentation for dispatchers and markdown generation - Update development status to Beta - Reorganize changelog format BREAKING CHANGE: Memory threshold in MemoryAdaptiveDispatcher increased to 90% and SemaphoreDispatcher parameter renamed to max_session_permit --- CHANGELOG.md | 150 ++++++++-- crawl4ai/__init__.py | 5 +- crawl4ai/__version__.py | 2 +- docs/examples/dispatcher_example.py | 3 +- docs/examples/llm_markdown_generator.py | 87 ++++++ .../scraping_strategies_performance.py | 135 +++++++++ docs/examples/v0_4_3_features_demo.py | 252 +++++++++++++++++ .../md_v2/advanced/multi-url-crawling copy.md | 264 
----------------- docs/md_v2/advanced/multi-url-crawling.md | 4 +- docs/md_v2/blog/releases/v0.4.3b1.md | 266 ++++++++++++++++++ docs/md_v2/core/markdown-generation.md | 2 +- pyproject.toml | 2 +- 12 files changed, 885 insertions(+), 287 deletions(-) create mode 100644 docs/examples/llm_markdown_generator.py create mode 100644 docs/examples/scraping_strategies_performance.py create mode 100644 docs/examples/v0_4_3_features_demo.py delete mode 100644 docs/md_v2/advanced/multi-url-crawling copy.md create mode 100644 docs/md_v2/blog/releases/v0.4.3b1.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c790f02..d62d8775 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,19 +1,3 @@ -### [Added] 2025-01-21 -- Added robots.txt compliance support with efficient SQLite-based caching -- New `check_robots_txt` parameter in CrawlerRunConfig to enable robots.txt checking -- Documentation updates for robots.txt compliance features and examples -- Automated robots.txt checking integrated into AsyncWebCrawler with 403 status codes for blocked URLs - -### [Added] 2025-01-20 -- Added proxy configuration support to CrawlerRunConfig allowing dynamic proxy settings per crawl request -- Updated documentation with examples for using proxy configuration in crawl operations - -### [Added] 2025-01-20 -- New LLM-powered schema generation utility for JsonElementExtractionStrategy -- Support for automatic CSS and XPath schema generation using OpenAI or Ollama -- Comprehensive documentation and examples for schema generation -- New prompt templates optimized for HTML schema analysis - # Changelog All notable changes to Crawl4AI will be documented in this file. @@ -21,6 +5,140 @@ All notable changes to Crawl4AI will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+Okay, here's a detailed changelog in Markdown format, generated from the provided git diff and commit history. I've focused on user-facing changes, fixes, and features, and grouped them as requested: + +## Version 0.4.3 (2025-01-21) + +This release introduces several powerful new features, including robots.txt compliance, dynamic proxy support, LLM-powered schema generation, and improved documentation. + +### Features + +- **Robots.txt Compliance:** + - Added robots.txt compliance support with efficient SQLite-based caching. + - New `check_robots_txt` parameter in `CrawlerRunConfig` to enable robots.txt checking before crawling a URL. + - Automated robots.txt checking is now integrated into `AsyncWebCrawler` with 403 status codes for blocked URLs. + +- **Proxy Configuration:** + - Added proxy configuration support to `CrawlerRunConfig`, allowing dynamic proxy settings per crawl request. + - Updated documentation with examples for using proxy configuration in crawl operations. + +- **LLM-Powered Schema Generation:** + - Introduced a new utility for automatic CSS and XPath schema generation using OpenAI or Ollama models. + - Added comprehensive documentation and examples for schema generation. + - New prompt templates optimized for HTML schema analysis. + +- **URL Redirection Tracking:** + - Added URL redirection tracking to capture the final URL after any redirects. + - The final URL is now available in the `final_url` field of the `AsyncCrawlResponse` object. + +- **Enhanced Streamlined Documentation:** + - Refactored and improved the documentation structure for clarity and ease of use. + - Added detailed explanations of new features and updated examples. + +- **Improved Browser Context Management:** + - Enhanced the management of browser contexts and added shared data support. + - Introduced the `shared_data` parameter in `CrawlerRunConfig` to pass data between hooks. 
+
+- **Memory Dispatcher System:**
+  - Migrated to a memory dispatcher system with enhanced monitoring capabilities.
+  - Introduced `MemoryAdaptiveDispatcher` and `SemaphoreDispatcher` for improved resource management.
+  - Added `RateLimiter` for rate limiting support.
+  - New `CrawlerMonitor` for real-time monitoring of crawler operations.
+
+- **Streaming Support:**
+  - Added streaming support so crawled URLs can be processed as soon as each crawl completes.
+  - Enabled streaming mode with the `stream` parameter in `CrawlerRunConfig`.
+
+- **Content Scraping Strategy:**
+  - Introduced a new `LXMLWebScrapingStrategy` for faster content scraping.
+  - Added support for selecting the scraping strategy via the `scraping_strategy` parameter in `CrawlerRunConfig`.
+
+### Bug Fixes
+
+- **Browser Path Management:**
+  - Improved browser path management for consistent behavior across different environments.
+
+- **Memory Threshold:**
+  - Adjusted the default memory threshold to improve resource utilization.
+
+- **Pydantic Model Fields:**
+  - Made several model fields optional with default values to improve flexibility.
+
+### Refactor
+
+- **Documentation Structure:**
+  - Reorganized documentation structure to improve navigation and readability.
+  - Updated styles and added new sections for advanced features.
+
+- **Scraping Mode:**
+  - Replaced the `ScrapingMode` enum with a strategy pattern for more flexible content scraping.
+
+- **Version Update:**
+  - Updated the version to `0.4.248`.
+
+- **Code Cleanup:**
+  - Removed unused files and improved type hints.
+  - Applied Ruff corrections for code quality.
+
+- **Updated dependencies:**
+  - Updated dependencies to their latest versions to ensure compatibility and security.
+
+- **Ignored certain patterns and directories:**
+  - Updated `.gitignore` and `.codeiumignore` to ignore additional patterns and directories, streamlining the development environment.
+ +- **Simplified Personal Story in README:** + - Streamlined the personal story and project vision in the `README.md` for clarity. + +- **Removed Deprecated Files:** + - Deleted several deprecated files and examples that are no longer relevant. + +--- +**Previous Releases:** + +### 0.4.24x (2024-12-31) +- **Enhanced SSL & Security**: New SSL certificate handling with custom paths and validation options for secure crawling. +- **Smart Content Filtering**: Advanced filtering system with regex support and efficient chunking strategies. +- **Improved JSON Extraction**: Support for complex JSONPath, JSON-CSS, and Microdata extraction. +- **New Field Types**: Added `computed`, `conditional`, `aggregate`, and `template` field types. +- **Performance Boost**: Optimized caching, parallel processing, and memory management. +- **Better Error Handling**: Enhanced debugging capabilities with detailed error tracking. +- **Security Features**: Improved input validation and safe expression evaluation. + +### 0.4.247 (2025-01-06) + +#### Added +- **Windows Event Loop Configuration**: Introduced a utility function `configure_windows_event_loop` to resolve `NotImplementedError` for asyncio subprocesses on Windows. ([#utils.py](crawl4ai/utils.py), [#tutorials/async-webcrawler-basics.md](docs/md_v3/tutorials/async-webcrawler-basics.md)) +- **`page_need_scroll` Method**: Added a method to determine if a page requires scrolling before taking actions in `AsyncPlaywrightCrawlerStrategy`. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py)) + +#### Changed +- **Version Bump**: Updated the version from `0.4.246` to `0.4.247`. ([#__version__.py](crawl4ai/__version__.py)) +- **Improved Scrolling Logic**: Enhanced scrolling methods in `AsyncPlaywrightCrawlerStrategy` by adding a `scroll_delay` parameter for better control. 
([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py)) +- **Markdown Generation Example**: Updated the `hello_world.py` example to reflect the latest API changes and better illustrate features. ([#examples/hello_world.py](docs/examples/hello_world.py)) +- **Documentation Update**: + - Added Windows-specific instructions for handling asyncio event loops. ([#async-webcrawler-basics.md](docs/md_v3/tutorials/async-webcrawler-basics.md)) + +#### Removed +- **Legacy Markdown Generation Code**: Removed outdated and unused code for markdown generation in `content_scraping_strategy.py`. ([#content_scraping_strategy.py](crawl4ai/content_scraping_strategy.py)) + +#### Fixed +- **Page Closing to Prevent Memory Leaks**: + - **Description**: Added a `finally` block to ensure pages are closed when no `session_id` is provided. + - **Impact**: Prevents memory leaks caused by lingering pages after a crawl. + - **File**: [`async_crawler_strategy.py`](crawl4ai/async_crawler_strategy.py) + - **Code**: + ```python + finally: + # If no session_id is given we should close the page + if not config.session_id: + await page.close() + ``` +- **Multiple Element Selection**: Modified `_get_elements` in `JsonCssExtractionStrategy` to return all matching elements instead of just the first one, ensuring comprehensive extraction. ([#extraction_strategy.py](crawl4ai/extraction_strategy.py)) +- **Error Handling in Scrolling**: Added robust error handling to ensure scrolling proceeds safely even if a configuration is missing. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py)) + +#### Other +- **Git Ignore Update**: Added `/plans` to `.gitignore` for better development environment consistency. 
([#.gitignore](.gitignore)) + + ## [0.4.24] - 2024-12-31 ### Added diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index beda64f8..482afdd7 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -12,10 +12,11 @@ from .extraction_strategy import ( LLMExtractionStrategy, CosineStrategy, JsonCssExtractionStrategy, + JsonXPathExtractionStrategy ) from .chunking_strategy import ChunkingStrategy, RegexChunking from .markdown_generation_strategy import DefaultMarkdownGenerator -from .content_filter_strategy import PruningContentFilter, BM25ContentFilter +from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter from .models import CrawlResult, MarkdownGenerationResult from .async_dispatcher import ( MemoryAdaptiveDispatcher, @@ -39,11 +40,13 @@ __all__ = [ "LLMExtractionStrategy", "CosineStrategy", "JsonCssExtractionStrategy", + "JsonXPathExtractionStrategy", "ChunkingStrategy", "RegexChunking", "DefaultMarkdownGenerator", "PruningContentFilter", "BM25ContentFilter", + "LLMContentFilter", "BaseDispatcher", "MemoryAdaptiveDispatcher", "SemaphoreDispatcher", diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index ea8194f4..5d2b86af 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.4.248" +__version__ = "0.4.3b1" diff --git a/docs/examples/dispatcher_example.py b/docs/examples/dispatcher_example.py index c9708ccc..ae6406bb 100644 --- a/docs/examples/dispatcher_example.py +++ b/docs/examples/dispatcher_example.py @@ -12,6 +12,7 @@ from crawl4ai import ( CrawlerMonitor, DisplayMode, CacheMode, + LXMLWebScrapingStrategy, ) @@ -113,7 +114,7 @@ def create_performance_table(results): async def main(): urls = [f"https://example.com/page{i}" for i in range(1, 20)] browser_config = BrowserConfig(headless=True, verbose=False) - run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, 
scraping_strategy=LXMLWebScrapingStrategy()) results = { "Memory Adaptive": await memory_adaptive(urls, browser_config, run_config), diff --git a/docs/examples/llm_markdown_generator.py b/docs/examples/llm_markdown_generator.py new file mode 100644 index 00000000..60b8549d --- /dev/null +++ b/docs/examples/llm_markdown_generator.py @@ -0,0 +1,87 @@ +import os +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode +from crawl4ai.content_filter_strategy import LLMContentFilter + +async def test_llm_filter(): + # Create an HTML source that needs intelligent filtering + url = "https://docs.python.org/3/tutorial/classes.html" + + browser_config = BrowserConfig( + headless=True, + verbose=True + ) + + # run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + run_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED) + + async with AsyncWebCrawler(config=browser_config) as crawler: + # First get the raw HTML + result = await crawler.arun(url, config=run_config) + html = result.cleaned_html + + # Initialize LLM filter with focused instruction + filter = LLMContentFilter( + provider="openai/gpt-4o", + api_token=os.getenv('OPENAI_API_KEY'), + instruction=""" + Focus on extracting the core educational content about Python classes. + Include: + - Key concepts and their explanations + - Important code examples + - Essential technical details + Exclude: + - Navigation elements + - Sidebars + - Footer content + - Version information + - Any non-essential UI elements + + Format the output as clean markdown with proper code blocks and headers. + """, + verbose=True + ) + + filter = LLMContentFilter( + provider="openai/gpt-4o", + api_token=os.getenv('OPENAI_API_KEY'), + chunk_token_threshold=2 ** 12 * 2, # 2048 * 2 + instruction=""" + Extract the main educational content while preserving its original wording and substance completely. Your task is to: + + 1. Maintain the exact language and terminology used in the main content + 2. 
Keep all technical explanations, examples, and educational content intact + 3. Preserve the original flow and structure of the core content + 4. Remove only clearly irrelevant elements like: + - Navigation menus + - Advertisement sections + - Cookie notices + - Footers with site information + - Sidebars with external links + - Any UI elements that don't contribute to learning + + The goal is to create a clean markdown version that reads exactly like the original article, + keeping all valuable content but free from distracting elements. Imagine you're creating + a perfect reading experience where nothing valuable is lost, but all noise is removed. + """, + verbose=True + ) + + # Apply filtering + filtered_content = filter.filter_content(html, ignore_cache = True) + + # Show results + print("\nFiltered Content Length:", len(filtered_content)) + print("\nFirst 500 chars of filtered content:") + if filtered_content: + print(filtered_content[0][:500]) + + # Save on disc the markdown version + with open("filtered_content.md", "w", encoding="utf-8") as f: + f.write("\n".join(filtered_content)) + + # Show token usage + filter.show_usage() + +if __name__ == "__main__": + asyncio.run(test_llm_filter()) \ No newline at end of file diff --git a/docs/examples/scraping_strategies_performance.py b/docs/examples/scraping_strategies_performance.py new file mode 100644 index 00000000..b8c80be2 --- /dev/null +++ b/docs/examples/scraping_strategies_performance.py @@ -0,0 +1,135 @@ +import time, re +from crawl4ai.content_scraping_strategy import WebScrapingStrategy, LXMLWebScrapingStrategy +import time +import functools +from collections import defaultdict + +class TimingStats: + def __init__(self): + self.stats = defaultdict(lambda: defaultdict(lambda: {"calls": 0, "total_time": 0})) + + def add(self, strategy_name, func_name, elapsed): + self.stats[strategy_name][func_name]["calls"] += 1 + self.stats[strategy_name][func_name]["total_time"] += elapsed + + def report(self): + for 
strategy_name, funcs in self.stats.items(): + print(f"\n{strategy_name} Timing Breakdown:") + print("-" * 60) + print(f"{'Function':<30} {'Calls':<10} {'Total(s)':<10} {'Avg(ms)':<10}") + print("-" * 60) + + for func, data in sorted(funcs.items(), key=lambda x: x[1]["total_time"], reverse=True): + avg_ms = (data["total_time"] / data["calls"]) * 1000 + print(f"{func:<30} {data['calls']:<10} {data['total_time']:<10.3f} {avg_ms:<10.2f}") + +timing_stats = TimingStats() + +# Modify timing decorator +def timing_decorator(strategy_name): + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + start = time.time() + result = func(*args, **kwargs) + elapsed = time.time() - start + timing_stats.add(strategy_name, func.__name__, elapsed) + return result + return wrapper + return decorator + +# Modified decorator application +def apply_decorators(cls, method_name, strategy_name): + try: + original_method = getattr(cls, method_name) + decorated_method = timing_decorator(strategy_name)(original_method) + setattr(cls, method_name, decorated_method) + except AttributeError: + print(f"Method {method_name} not found in class {cls.__name__}.") + +# Apply to key methods +methods_to_profile = [ + '_scrap', + # 'process_element', + '_process_element', + 'process_image', +] + + +# Apply decorators to both strategies +for strategy, name in [(WebScrapingStrategy, "Original"), (LXMLWebScrapingStrategy, "LXML")]: + for method in methods_to_profile: + apply_decorators(strategy, method, name) + + +def generate_large_html(n_elements=1000): + html = [''] + for i in range(n_elements): + html.append(f''' +
+        <div class="article" id="article-{i}">
+            <h2 class="title">Heading {i}</h2>
+            <div class="content">
+                <p>This is paragraph {i} with some content and a <a href="http://example.com/page{i}">link</a></p>
+                <img src="http://example.com/image{i}.jpg" alt="Image {i}">
+                <ul>
+                    <li>List item {i}.1</li>
+                    <li>List item {i}.2</li>
+                </ul>
+            </div>
+        </div>
+ ''') + html.append('') + return ''.join(html) + +def test_scraping(): + # Initialize both scrapers + original_scraper = WebScrapingStrategy() + selected_scraper = LXMLWebScrapingStrategy() + + # Generate test HTML + print("Generating HTML...") + html = generate_large_html(5000) + print(f"HTML Size: {len(html)/1024:.2f} KB") + + # Time the scraping + print("\nStarting scrape...") + start_time = time.time() + + kwargs = { + "url": "http://example.com", + "html": html, + "word_count_threshold": 5, + "keep_data_attributes": True + } + + t1 = time.perf_counter() + result_selected = selected_scraper.scrap(**kwargs) + t2 = time.perf_counter() + + result_original = original_scraper.scrap(**kwargs) + t3 = time.perf_counter() + + elapsed = t3 - start_time + print(f"\nScraping completed in {elapsed:.2f} seconds") + + timing_stats.report() + + # Print stats of LXML output + print("\nLXML Output:") + print(f"\nExtracted links: {len(result_selected['links']['internal']) + len(result_selected['links']['external'])}") + print(f"Extracted images: {len(result_selected['media']['images'])}") + print(f"Clean HTML size: {len(result_selected['cleaned_html'])/1024:.2f} KB") + print(f"Scraping time: {t2 - t1:.2f} seconds") + + # Print stats of original output + print("\nOriginal Output:") + print(f"\nExtracted links: {len(result_original['links']['internal']) + len(result_original['links']['external'])}") + print(f"Extracted images: {len(result_original['media']['images'])}") + print(f"Clean HTML size: {len(result_original['cleaned_html'])/1024:.2f} KB") + print(f"Scraping time: {t3 - t1:.2f} seconds") + + +if __name__ == "__main__": + test_scraping() \ No newline at end of file diff --git a/docs/examples/v0_4_3_features_demo.py b/docs/examples/v0_4_3_features_demo.py new file mode 100644 index 00000000..2ffaa172 --- /dev/null +++ b/docs/examples/v0_4_3_features_demo.py @@ -0,0 +1,252 @@ +""" +Crawl4ai v0.4.3 Features Demo +============================ + +This example demonstrates the 
major new features introduced in Crawl4ai v0.4.3. +Each section showcases a specific feature with practical examples and explanations. +""" + +import asyncio +import os +from crawl4ai import * + + +async def demo_memory_dispatcher(): + """ + 1. Memory Dispatcher System Demo + =============================== + Shows how to use the new memory dispatcher with monitoring + """ + print("\n=== 1. Memory Dispatcher System Demo ===") + + # Configure crawler + browser_config = BrowserConfig(headless=True, verbose=True) + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, markdown_generator=DefaultMarkdownGenerator() + ) + + # Test URLs + urls = ["http://example.com", "http://example.org", "http://example.net"] * 3 + + async with AsyncWebCrawler(config=browser_config) as crawler: + # Initialize dispatcher with monitoring + monitor = CrawlerMonitor( + max_visible_rows=10, + display_mode=DisplayMode.DETAILED, # Can be DETAILED or AGGREGATED + ) + + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=80.0, # Memory usage threshold + check_interval=0.5, # How often to check memory + max_session_permit=5, # Max concurrent crawls + monitor=monitor, # Pass the monitor + ) + + # Run with memory monitoring + print("Starting batch crawl with memory monitoring...") + results = await dispatcher.run_urls( + urls=urls, + crawler=crawler, + config=crawler_config, + ) + print(f"Completed {len(results)} URLs") + + +async def demo_streaming_support(): + """ + 2. Streaming Support Demo + ====================== + Shows how to process URLs as they complete using streaming + """ + print("\n=== 2. 
Streaming Support Demo ===") + + browser_config = BrowserConfig(headless=True, verbose=True) + crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=True) + + # Test URLs + urls = ["http://example.com", "http://example.org", "http://example.net"] * 2 + + async with AsyncWebCrawler(config=browser_config) as crawler: + # Initialize dispatcher for streaming + dispatcher = MemoryAdaptiveDispatcher(max_session_permit=3, check_interval=0.5) + + print("Starting streaming crawl...") + async for result in dispatcher.run_urls_stream( + urls=urls, crawler=crawler, config=crawler_config + ): + # Process each result as it arrives + print( + f"Received result for {result.url} - Success: {result.result.success}" + ) + if result.result.success: + print(f"Content length: {len(result.result.markdown)}") + + +async def demo_content_scraping(): + """ + 3. Content Scraping Strategy Demo + ============================== + Demonstrates the new LXMLWebScrapingStrategy for faster content scraping. + """ + print("\n=== 3. Content Scraping Strategy Demo ===") + + crawler = AsyncWebCrawler() + url = "https://example.com/article" + + # Configure with the new LXML strategy + config = CrawlerRunConfig(scraping_strategy=LXMLWebScrapingStrategy(), verbose=True) + + print("Scraping content with LXML strategy...") + async with crawler: + result = await crawler.arun(url, config=config) + if result.success: + print("Successfully scraped content using LXML strategy") + + +async def demo_llm_markdown(): + """ + 4. LLM-Powered Markdown Generation Demo + =================================== + Shows how to use the new LLM-powered content filtering and markdown generation. + """ + print("\n=== 4. 
LLM-Powered Markdown Generation Demo ===") + + crawler = AsyncWebCrawler() + url = "https://docs.python.org/3/tutorial/classes.html" + + content_filter = LLMContentFilter( + provider="openai/gpt-4o", + api_token=os.getenv("OPENAI_API_KEY"), + instruction=""" + Focus on extracting the core educational content about Python classes. + Include: + - Key concepts and their explanations + - Important code examples + - Essential technical details + Exclude: + - Navigation elements + - Sidebars + - Footer content + - Version information + - Any non-essential UI elements + + Format the output as clean markdown with proper code blocks and headers. + """, + verbose=True, + ) + + # Configure LLM-powered markdown generation + config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + content_filter=content_filter + ), + cache_mode = CacheMode.BYPASS, + verbose=True + ) + + print("Generating focused markdown with LLM...") + async with crawler: + result = await crawler.arun(url, config=config) + if result.success and result.markdown_v2: + print("Successfully generated LLM-filtered markdown") + print("First 500 chars of filtered content:") + print(result.markdown_v2.fit_markdown[:500]) + print("Successfully generated LLM-filtered markdown") + + +async def demo_robots_compliance(): + """ + 5. Robots.txt Compliance Demo + ========================== + Demonstrates the new robots.txt compliance feature with SQLite caching. + """ + print("\n=== 5. 
Robots.txt Compliance Demo ===") + + crawler = AsyncWebCrawler() + urls = ["https://example.com", "https://facebook.com", "https://twitter.com"] + + # Enable robots.txt checking + config = CrawlerRunConfig(check_robots_txt=True, verbose=True) + + print("Crawling with robots.txt compliance...") + async with crawler: + results = await crawler.arun_many(urls, config=config) + for result in results: + if result.status_code == 403: + print(f"Access blocked by robots.txt: {result.url}") + elif result.success: + print(f"Successfully crawled: {result.url}") + + + +async def demo_llm_schema_generation(): + """ + 7. LLM-Powered Schema Generation Demo + ================================= + Demonstrates automatic CSS and XPath schema generation using LLM models. + """ + print("\n=== 7. LLM-Powered Schema Generation Demo ===") + + # Example HTML content for a job listing + html_content = """ +
+    <div class="job-listing">
+        <h1 class="job-title">Senior Software Engineer</h1>
+        <div class="job-details">
+            <span class="location">San Francisco, CA</span>
+            <span class="salary">$150,000 - $200,000</span>
+        </div>
+        <div class="description">
+            <h2>Requirements</h2>
+            <ul>
+                <li>5+ years Python experience</li>
+                <li>Strong background in web crawling</li>
+            </ul>
+        </div>
+    </div>
+    """
+
+    print("Generating CSS selectors schema...")
+    # Generate CSS selectors with a specific query
+    css_schema = JsonCssExtractionStrategy.generate_schema(
+        html_content,
+        schema_type="CSS",
+        query="Extract job title, location, and salary information",
+        provider="openai/gpt-4o",  # or use other providers like "ollama"
+    )
+    print("\nGenerated CSS Schema:")
+    print(css_schema)
+
+    # Example of using the generated schema with crawler
+    crawler = AsyncWebCrawler()
+    url = "https://example.com/job-listing"
+
+    # Create an extraction strategy with the generated schema
+    extraction_strategy = JsonCssExtractionStrategy(schema=css_schema)
+
+    config = CrawlerRunConfig(extraction_strategy=extraction_strategy, verbose=True)
+
+    print("\nTesting generated schema with crawler...")
+    async with crawler:
+        result = await crawler.arun(url, config=config)
+        if result.success:
+            import json  # local import: json is not imported at module level
+            print(json.dumps(result.extracted_content, indent=2) if result.extracted_content else None)
+            print("Successfully used generated schema for crawling")
+
+
+async def main():
+    """Run all feature demonstrations."""
+    # Each demo is a coroutine and must be awaited, otherwise nothing runs.
+    await demo_memory_dispatcher()
+    print("\n" + "=" * 50 + "\n")
+    await demo_streaming_support()
+    print("\n" + "=" * 50 + "\n")
+    await demo_content_scraping()
+    print("\n" + "=" * 50 + "\n")
+    await demo_llm_schema_generation()
+    print("\n" + "=" * 50 + "\n")
+    await demo_llm_markdown()
+    print("\n" + "=" * 50 + "\n")
+    await demo_robots_compliance()
+    print("\n" + "=" * 50 + "\n")
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/docs/md_v2/advanced/multi-url-crawling copy.md b/docs/md_v2/advanced/multi-url-crawling copy.md
deleted file mode 100644
index a1d2b423..00000000
--- a/docs/md_v2/advanced/multi-url-crawling copy.md	
+++ /dev/null
@@ -1,264 +0,0 @@
-# Optimized Multi-URL Crawling
-
-> **Note**: We’re developing a new **executor module** that uses a sophisticated algorithm to dynamically manage multi-URL crawling, optimizing for speed and memory usage.
The approaches in this document remain fully valid, but keep an eye on **Crawl4AI**’s upcoming releases for this powerful feature! Follow [@unclecode](https://twitter.com/unclecode) on X and check the changelogs to stay updated. - - -Crawl4AI’s **AsyncWebCrawler** can handle multiple URLs in a single run, which can greatly reduce overhead and speed up crawling. This guide shows how to: - -1. **Sequentially** crawl a list of URLs using the **same** session, avoiding repeated browser creation. -2. **Parallel**-crawl subsets of URLs in batches, again reusing the same browser. - -When the entire process finishes, you close the browser once—**minimizing** memory and resource usage. - ---- - -## 1. Why Avoid Simple Loops per URL? - -If you naively do: - -```python -for url in urls: - async with AsyncWebCrawler() as crawler: - result = await crawler.arun(url) -``` - -You end up: - -1. Spinning up a **new** browser for each URL -2. Closing it immediately after the single crawl -3. Potentially using a lot of CPU/memory for short-living browsers -4. Missing out on session reusability if you have login or ongoing states - -**Better** approaches ensure you **create** the browser once, then crawl multiple URLs with minimal overhead. - ---- - -## 2. Sequential Crawling with Session Reuse - -### 2.1 Overview - -1. **One** `AsyncWebCrawler` instance for **all** URLs. -2. **One** session (via `session_id`) so we can preserve local storage or cookies across URLs if needed. -3. The crawler is only closed at the **end**. - -**This** is the simplest pattern if your workload is moderate (dozens to a few hundred URLs). 
- -### 2.2 Example Code - -```python -import asyncio -from typing import List -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig -from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator - -async def crawl_sequential(urls: List[str]): - print("\n=== Sequential Crawling with Session Reuse ===") - - browser_config = BrowserConfig( - headless=True, - # For better performance in Docker or low-memory environments: - extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"], - ) - - crawl_config = CrawlerRunConfig( - markdown_generator=DefaultMarkdownGenerator() - ) - - # Create the crawler (opens the browser) - crawler = AsyncWebCrawler(config=browser_config) - await crawler.start() - - try: - session_id = "session1" # Reuse the same session across all URLs - for url in urls: - result = await crawler.arun( - url=url, - config=crawl_config, - session_id=session_id - ) - if result.success: - print(f"Successfully crawled: {url}") - # E.g. check markdown length - print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}") - else: - print(f"Failed: {url} - Error: {result.error_message}") - finally: - # After all URLs are done, close the crawler (and the browser) - await crawler.close() - -async def main(): - urls = [ - "https://example.com/page1", - "https://example.com/page2", - "https://example.com/page3" - ] - await crawl_sequential(urls) - -if __name__ == "__main__": - asyncio.run(main()) -``` - -**Why It’s Good**: - -- **One** browser launch. -- Minimal memory usage. -- If the site requires login, you can log in once in `session_id` context and preserve auth across all URLs. - ---- - -## 3. Parallel Crawling with Browser Reuse - -### 3.1 Overview - -To speed up crawling further, you can crawl multiple URLs in **parallel** (batches or a concurrency limit). The crawler still uses **one** browser, but spawns different sessions (or the same, depending on your logic) for each task. 
- -### 3.2 Example Code - -For this example make sure to install the [psutil](https://pypi.org/project/psutil/) package. - -```bash -pip install psutil -``` - -Then you can run the following code: - -```python -import os -import sys -import psutil -import asyncio - -__location__ = os.path.dirname(os.path.abspath(__file__)) -__output__ = os.path.join(__location__, "output") - -# Append parent directory to system path -parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -sys.path.append(parent_dir) - -from typing import List -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode - -async def crawl_parallel(urls: List[str], max_concurrent: int = 3): - print("\n=== Parallel Crawling with Browser Reuse + Memory Check ===") - - # We'll keep track of peak memory usage across all tasks - peak_memory = 0 - process = psutil.Process(os.getpid()) - - def log_memory(prefix: str = ""): - nonlocal peak_memory - current_mem = process.memory_info().rss # in bytes - if current_mem > peak_memory: - peak_memory = current_mem - print(f"{prefix} Current Memory: {current_mem // (1024 * 1024)} MB, Peak: {peak_memory // (1024 * 1024)} MB") - - # Minimal browser config - browser_config = BrowserConfig( - headless=True, - verbose=False, # corrected from 'verbos=False' - extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"], - ) - crawl_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) - - # Create the crawler instance - crawler = AsyncWebCrawler(config=browser_config) - await crawler.start() - - try: - # We'll chunk the URLs in batches of 'max_concurrent' - success_count = 0 - fail_count = 0 - for i in range(0, len(urls), max_concurrent): - batch = urls[i : i + max_concurrent] - tasks = [] - - for j, url in enumerate(batch): - # Unique session_id per concurrent sub-task - session_id = f"parallel_session_{i + j}" - task = crawler.arun(url=url, config=crawl_config, session_id=session_id) - tasks.append(task) - - # Check 
memory usage prior to launching tasks - log_memory(prefix=f"Before batch {i//max_concurrent + 1}: ") - - # Gather results - results = await asyncio.gather(*tasks, return_exceptions=True) - - # Check memory usage after tasks complete - log_memory(prefix=f"After batch {i//max_concurrent + 1}: ") - - # Evaluate results - for url, result in zip(batch, results): - if isinstance(result, Exception): - print(f"Error crawling {url}: {result}") - fail_count += 1 - elif result.success: - success_count += 1 - else: - fail_count += 1 - - print(f"\nSummary:") - print(f" - Successfully crawled: {success_count}") - print(f" - Failed: {fail_count}") - - finally: - print("\nClosing crawler...") - await crawler.close() - # Final memory log - log_memory(prefix="Final: ") - print(f"\nPeak memory usage (MB): {peak_memory // (1024 * 1024)}") - -async def main(): - urls = [ - "https://example.com/page1", - "https://example.com/page2", - "https://example.com/page3", - "https://example.com/page4" - ] - await crawl_parallel(urls, max_concurrent=2) - -if __name__ == "__main__": - asyncio.run(main()) - -``` - -**Notes**: - -- We **reuse** the same `AsyncWebCrawler` instance for all parallel tasks, launching **one** browser. -- Each parallel sub-task might get its own `session_id` so they don’t share cookies/localStorage (unless that’s desired). -- We limit concurrency to `max_concurrent=2` or 3 to avoid saturating CPU/memory. - ---- - -## 4. Performance Tips - -1. **Extra Browser Args** - - `--disable-gpu`, `--no-sandbox` can help in Docker or restricted environments. - - `--disable-dev-shm-usage` avoids using `/dev/shm` which can be small on some systems. - -2. **Session Reuse** - - If your site requires a login or you want to maintain local data across URLs, share the **same** `session_id`. - - If you want isolation (each URL fresh), create unique sessions. - -3. **Batching** - - If you have **many** URLs (like thousands), you can do parallel crawling in chunks (like `max_concurrent=5`). 
- - Use `arun_many()` for a built-in approach if you prefer, but the example above is often more flexible. - -4. **Cache** - - If your pages share many resources or you’re re-crawling the same domain repeatedly, consider setting `cache_mode=CacheMode.ENABLED` in `CrawlerRunConfig`. - - If you need fresh data each time, keep `cache_mode=CacheMode.BYPASS`. - -5. **Hooks** - - You can set up global hooks for each crawler (like to block images) or per-run if you want. - - Keep them consistent if you’re reusing sessions. - ---- - -## 5. Summary - -- **One** `AsyncWebCrawler` + multiple calls to `.arun()` is far more efficient than launching a new crawler per URL. -- **Sequential** approach with a shared session is simple and memory-friendly for moderate sets of URLs. -- **Parallel** approach can speed up large crawls by concurrency, but keep concurrency balanced to avoid overhead. -- Close the crawler once at the end, ensuring the browser is only opened/closed once. - -For even more advanced memory optimizations or dynamic concurrency patterns, see future sections on hooking or distributed crawling. The patterns above suffice for the majority of multi-URL scenarios—**giving you speed, simplicity, and minimal resource usage**. Enjoy your optimized crawling! 
\ No newline at end of file diff --git a/docs/md_v2/advanced/multi-url-crawling.md b/docs/md_v2/advanced/multi-url-crawling.md index d9b04535..12c4f916 100644 --- a/docs/md_v2/advanced/multi-url-crawling.md +++ b/docs/md_v2/advanced/multi-url-crawling.md @@ -58,7 +58,7 @@ Automatically manages concurrency based on system memory usage: ```python dispatcher = MemoryAdaptiveDispatcher( - memory_threshold_percent=70.0, # Pause if memory exceeds this + memory_threshold_percent=90.0, # Pause if memory exceeds this check_interval=1.0, # How often to check memory max_session_permit=10, # Maximum concurrent tasks rate_limiter=RateLimiter( # Optional rate limiting @@ -79,7 +79,7 @@ Provides simple concurrency control with a fixed limit: ```python dispatcher = SemaphoreDispatcher( - semaphore_count=5, # Fixed concurrent tasks + max_session_permit=5, # Fixed concurrent tasks rate_limiter=RateLimiter( # Optional rate limiting base_delay=(0.5, 1.0), max_delay=10.0 diff --git a/docs/md_v2/blog/releases/v0.4.3b1.md b/docs/md_v2/blog/releases/v0.4.3b1.md new file mode 100644 index 00000000..f648b462 --- /dev/null +++ b/docs/md_v2/blog/releases/v0.4.3b1.md @@ -0,0 +1,266 @@ +# Crawl4AI 0.4.3b1 is Here: Faster, Smarter, and Ready for Real-World Crawling! + +Hey, Crawl4AI enthusiasts! We're thrilled to announce the release of **Crawl4AI 0.4.3b1**, packed with powerful new features and enhancements that take web crawling to a whole new level of efficiency and intelligence. This release is all about giving you more control, better performance, and deeper insights into your crawled data. + +Let's dive into what's new! + +## 🚀 Major Feature Highlights + +### 1. LLM-Powered Schema Generation: Zero to Structured Data in Seconds! + +Tired of manually crafting CSS or XPath selectors? We've got you covered! Crawl4AI now features a revolutionary **schema generator** that uses the power of Large Language Models (LLMs) to automatically create extraction schemas for you. + +**How it Works:** + +1. 
**Provide HTML**: Feed in a sample HTML snippet that contains the type of data you want to extract (e.g., product listings, article sections). +2. **Describe Your Needs (Optional)**: You can provide a natural language query like "extract all product names and prices" to guide the schema creation. +3. **Choose Your LLM**: Use either **OpenAI** (GPT-4o recommended) for top-tier accuracy or **Ollama** for a local, open-source option. +4. **Get Your Schema**: The tool outputs a ready-to-use JSON schema that works seamlessly with `JsonCssExtractionStrategy` or `JsonXPathExtractionStrategy`. + +**Why You'll Love It:** + +- **No More Tedious Selector Writing**: Let the LLM analyze the HTML and create the selectors for you! +- **One-Time Cost**: Schema generation uses LLM, but once you have your schema, subsequent extractions are fast and LLM-free. +- **Handles Complex Structures**: The LLM can understand nested elements, lists, and variations in layout—far beyond what simple CSS selectors can achieve. +- **Learn by Example**: The generated schemas are a fantastic way to learn best practices for writing your own schemas. + +**Example:** + +```python +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + +# Sample HTML snippet (imagine this is part of a product listing page) +html = """ +
+<div class="product">
+  <h2 class="name">Awesome Gadget</h2>
+  <span class="price">$99.99</span>
+</div>
+""" + +# Generate schema using OpenAI +schema = JsonCssExtractionStrategy.generate_schema( + html, + llm_provider="openai/gpt-4o", + api_token="YOUR_API_TOKEN" +) + +# Or use Ollama for a local, open-source option +# schema = JsonCssExtractionStrategy.generate_schema( +# html, +# llm_provider="ollama/llama3" +# ) + +print(json.dumps(schema, indent=2)) +``` + +**Output (Schema):** + +```json +{ + "name": null, + "baseSelector": "div.product", + "fields": [ + { + "name": "name", + "selector": "h2.name", + "type": "text" + }, + { + "name": "price", + "selector": "span.price", + "type": "text" + } + ] +} +``` + +You can now **save** this schema and use it for all your extractions on pages with the same structure. No more LLM costs, just **fast, reliable** data extraction! + +### 2. Robots.txt Compliance: Crawl Responsibly + +Crawl4AI now respects website rules! With the new `check_robots_txt=True` option in `CrawlerRunConfig`, the crawler automatically fetches, parses, and obeys each site's `robots.txt` file. + +**Key Features**: + +- **Efficient Caching**: Stores parsed `robots.txt` files locally for 7 days to avoid re-fetching. +- **Automatic Integration**: Works seamlessly with both `arun()` and `arun_many()`. +- **Clear Status Codes**: Returns a 403 status code if a URL is disallowed. +- **Customizable**: Adjust the cache directory and TTL if needed. + +**Example**: + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode + +async def main(): + config = CrawlerRunConfig( + cache_mode=CacheMode.ENABLED, + check_robots_txt=True + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com/private-page", config=config) + if result.status_code == 403: + print("Access denied by robots.txt") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### 3. Proxy Support in `CrawlerRunConfig` + +Need more control over your proxy settings? 
Now you can configure proxies directly within `CrawlerRunConfig` for each crawl: + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig + +async def main(): + config = CrawlerRunConfig( + proxy_config={ + "server": "http://your-proxy.com:8080", + "username": "your_username", # Optional + "password": "your_password" # Optional + } + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com", config=config) +``` + +This allows for dynamic proxy assignment per URL or even per request. + +### 4. LLM-Powered Markdown Filtering (Beta) + +We're introducing an experimental **`LLMContentFilter`**! This filter, when used with the `DefaultMarkdownGenerator`, can produce highly focused markdown output by using an LLM to analyze content relevance. + +**How it Works:** + +1. You provide an **instruction** (e.g., "extract only the key technical details"). +2. The LLM analyzes each section of the page based on your instruction. +3. Only the most relevant content is included in the final `fit_markdown`. + +**Example**: + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.content_filter_strategy import LLMContentFilter +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator + +async def main(): + llm_filter = LLMContentFilter( + provider="openai/gpt-4o", + api_token="YOUR_API_TOKEN", # Or use "ollama/llama3" with no token + instruction="Extract the core educational content about Python classes." + ) + + config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator(content_filter=llm_filter) + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + "https://docs.python.org/3/tutorial/classes.html", + config=config + ) + print(result.markdown_v2.fit_markdown) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Note**: This is a beta feature. We're actively working on improving its accuracy and performance. + +### 5. 
Streamlined `arun_many()` with Dispatchers + +We've simplified concurrent crawling! `arun_many()` now intelligently handles multiple URLs, either returning a **list** of results or an **async generator** for streaming. + +**Basic Usage (Batch)**: + +```python +results = await crawler.arun_many( + urls=["https://site1.com", "https://site2.com"], + config=CrawlerRunConfig() +) + +for res in results: + print(res.url, "crawled successfully:", res.success) +``` + +**Streaming Mode**: + +```python +async for result in await crawler.arun_many( + urls=["https://site1.com", "https://site2.com"], + config=CrawlerRunConfig(stream=True) +): + print("Just finished:", result.url) + # Process each result immediately +``` + +**Advanced:** You can now customize how `arun_many` handles concurrency by passing a **dispatcher**. See [Advanced Multi-URL Crawling](../advanced/multi-url-crawling.md) for details. + +### 6. Enhanced Browser Context Management + +We've improved how Crawl4AI manages browser contexts for better resource utilization and session handling. + +- **`shared_data` in `CrawlerRunConfig`**: Pass data between hooks using the `shared_data` dictionary. +- **Context Reuse**: The crawler now intelligently reuses browser contexts based on configuration, reducing overhead. + +### 7. Faster Scraping with `LXMLWebScrapingStrategy` + +Introducing a new, optional **`LXMLWebScrapingStrategy`** that can be **10-20x faster** than the default BeautifulSoup approach for large, complex pages. + +**How to Use**: + +```python +from crawl4ai import LXMLWebScrapingStrategy + +config = CrawlerRunConfig( + scraping_strategy=LXMLWebScrapingStrategy() # Add this line +) +``` + +**When to Use**: +- If profiling shows a bottleneck in `WebScrapingStrategy`. +- For very large HTML documents where parsing speed matters. + +**Caveats**: +- It might not handle malformed HTML as gracefully as BeautifulSoup. +- We're still gathering data, so report any issues! + +--- + +## Try the Feature Demo Script! 
+ +We've prepared a Python script demonstrating these new features. You can find it at: + +[**`features_demo.py`**](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/0_4_3b1_feature_demo.py) + +**To run the demo:** + +1. Make sure you have Crawl4AI installed (`pip install crawl4ai`). +2. Copy the `features_demo.py` script to your local environment. +3. Set your OpenAI API key as an environment variable (if using OpenAI models): + ```bash + export OPENAI_API_KEY="your_api_key" + ``` +4. Run the script: + ```bash + python features_demo.py + ``` + +The script will execute various crawl scenarios, showcasing the new features and printing results to your console. + +## Conclusion + +Crawl4AI version 0.4.3b1 is a major step forward in flexibility, performance, and ease of use. With automatic schema generation, robots.txt handling, advanced content filtering, and streamlined multi-URL crawling, you can build powerful, efficient, and responsible web scrapers. + +We encourage you to try out these new capabilities, explore the updated documentation, and share your feedback! Your input is invaluable as we continue to improve Crawl4AI. + +**Stay Connected:** + +- **Star** us on [GitHub](https://github.com/unclecode/crawl4ai) to show your support! +- **Follow** [@unclecode](https://twitter.com/unclecode) on Twitter for updates and tips. +- **Join** our community on Discord (link coming soon) to discuss your projects and get help. + +Happy crawling! 
diff --git a/docs/md_v2/core/markdown-generation.md b/docs/md_v2/core/markdown-generation.md index 98a30652..ab8f9b05 100644 --- a/docs/md_v2/core/markdown-generation.md +++ b/docs/md_v2/core/markdown-generation.md @@ -181,7 +181,7 @@ from crawl4ai.content_filter_strategy import LLMContentFilter async def main(): # Initialize LLM filter with specific instruction filter = LLMContentFilter( - provider="openai/gpt-4", # or your preferred provider + provider="openai/gpt-4o", # or your preferred provider api_token="your-api-token", # or use environment variable instruction=""" Focus on extracting the core educational content. diff --git a/pyproject.toml b/pyproject.toml index c9bd9ad3..328438e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ dependencies = [ "httpx==0.27.2", ] classifiers = [ - "Development Status :: 3 - Alpha", + "Development Status :: 4 - Beta", "Intended Audience :: Developers", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python :: 3", From 88697c46305dfe787883cf59452592a30265b4d4 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 21 Jan 2025 21:20:04 +0800 Subject: [PATCH 04/19] docs(readme): update version and feature announcements for v0.4.3b1 Update README.md to announce version 0.4.3b1 release with new features including: - Memory Dispatcher System - Streaming Support - LLM-Powered Markdown Generation - Schema Generation - Robots.txt Compliance Add detailed version numbering explanation section to help users understand pre-release versions. --- README.md | 70 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 59 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 68cc10a2..66a652ff 100644 --- a/README.md +++ b/README.md @@ -20,9 +20,9 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. 
Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease.

-[✨ Check out latest update v0.4.24x](#-recent-updates)
+[✨ Check out latest update v0.4.3b1](#-recent-updates)

-🎉 **Version 0.4.24x is out!** Major improvements in extraction strategies with enhanced JSON handling, SSL security, and Amazon product extraction. Plus, a completely revamped content filtering system! [Read the release notes →](https://docs.crawl4ai.com/blog)
+🎉 **Version 0.4.3b1 is out!** This release brings exciting new features like a Memory Dispatcher System, Streaming Support, LLM-Powered Markdown Generation, Schema Generation, and Robots.txt Compliance! [Read the release notes →](https://docs.crawl4ai.com/blog)
🤓 My Personal Story @@ -481,18 +481,66 @@ async def test_news_crawl():
+## ✨ Recent Updates -## ✨ Recent Updates +- **🚀 New Dispatcher System**: Scale to thousands of URLs with intelligent **memory monitoring**, **concurrency control**, and optional **rate limiting**. (See `MemoryAdaptiveDispatcher`, `SemaphoreDispatcher`, `RateLimiter`, `CrawlerMonitor`) +- **⚡ Streaming Mode**: Process results **as they arrive** instead of waiting for an entire batch to complete. (Set `stream=True` in `CrawlerRunConfig`) +- **🤖 Enhanced LLM Integration**: + - **Automatic schema generation**: Create extraction rules from HTML using OpenAI or Ollama, no manual CSS/XPath needed. + - **LLM-powered Markdown filtering**: Refine your markdown output with a new `LLMContentFilter` that understands content relevance. + - **Ollama Support**: Use open-source or self-hosted models for private or cost-effective extraction. +- **🏎️ Faster Scraping Option**: New `LXMLWebScrapingStrategy` offers **10-20x speedup** for large, complex pages (experimental). +- **🤖 robots.txt Compliance**: Respect website rules with `check_robots_txt=True` and efficient local caching. +- **➡️ URL Redirection Tracking**: The `final_url` field now captures the final destination after any redirects. +- **🪞 Improved Mirroring**: The `LXMLWebScrapingStrategy` now has much greater fidelity, allowing for almost pixel-perfect mirroring of websites. +- **📈 Enhanced Monitoring**: Track memory, CPU, and individual crawler status with `CrawlerMonitor`. +- **📝 Improved Documentation**: More examples, clearer explanations, and updated tutorials. 
-
-- 🔒 **Enhanced SSL & Security**: New SSL certificate handling with custom paths and validation options for secure crawling
-- 🔍 **Smart Content Filtering**: Advanced filtering system with regex support and efficient chunking strategies
-- 📦 **Improved JSON Extraction**: Support for complex JSONPath, JSON-CSS, and Microdata extraction
-- 🏗️ **New Field Types**: Added `computed`, `conditional`, `aggregate`, and `template` field types
-- ⚡ **Performance Boost**: Optimized caching, parallel processing, and memory management
-- 🐛 **Better Error Handling**: Enhanced debugging capabilities with detailed error tracking
-- 🔐 **Security Features**: Improved input validation and safe expression evaluation
+Read the full details in our [0.4.3b1 Release Notes](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md).
-Read the full details of this release in our [0.4.24 Release Notes](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md).
+The following section explains how Crawl4AI version numbers work:
+
+
+## Version Numbering in Crawl4AI
+
+Crawl4AI follows standard Python version numbering conventions (PEP 440) to help users understand the stability and features of each release.
+
+### Version Numbers Explained
+
+Our version numbers follow this pattern: `MAJOR.MINOR.PATCH` (e.g., 0.4.3)
+
+#### Pre-release Versions
+We use different suffixes to indicate development stages:
+
+- `dev` (0.4.3dev1): Development versions, unstable
+- `a` (0.4.3a1): Alpha releases, experimental features
+- `b` (0.4.3b1): Beta releases, feature complete but needs testing
+- `rc` (0.4.3rc1): Release candidates, potential final version
+
+#### Installation
+- Regular installation (stable version):
+  ```bash
+  pip install -U crawl4ai
+  ```
+
+- Install pre-release versions:
+  ```bash
+  pip install crawl4ai --pre
+  ```
+
+- Install specific version:
+  ```bash
+  pip install crawl4ai==0.4.3b1
+  ```
+
+#### Why Pre-releases?
+We use pre-releases to: +- Test new features in real-world scenarios +- Gather feedback before final releases +- Ensure stability for production users +- Allow early adopters to try new features + +For production environments, we recommend using the stable version. For testing new features, you can opt-in to pre-releases using the `--pre` flag. ## 📖 Documentation & Roadmap From dee5fe9851c1a38225531b53f82bdae3aaf5f33b Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 22 Jan 2025 16:11:01 +0800 Subject: [PATCH 05/19] feat(proxy): add proxy rotation support and documentation Implements dynamic proxy rotation functionality with authentication support and IP verification. Updates include: - Added proxy rotation demo in features example - Updated proxy configuration handling in BrowserManager - Added proxy rotation documentation - Updated README with new proxy rotation feature - Bumped version to 0.4.3b2 This change enables users to dynamically switch between proxies and verify IP addresses for each request. --- README.md | 1 + crawl4ai/__version__.py | 2 +- crawl4ai/async_crawler_strategy.py | 7 +++- docs/examples/v0_4_3_features_demo.py | 60 +++++++++++++++++++++++++++ docs/md_v2/advanced/proxy-security.md | 2 +- 5 files changed, 68 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 66a652ff..9cfe4512 100644 --- a/README.md +++ b/README.md @@ -491,6 +491,7 @@ async def test_news_crawl(): - **Ollama Support**: Use open-source or self-hosted models for private or cost-effective extraction. - **🏎️ Faster Scraping Option**: New `LXMLWebScrapingStrategy` offers **10-20x speedup** for large, complex pages (experimental). - **🤖 robots.txt Compliance**: Respect website rules with `check_robots_txt=True` and efficient local caching. +- **🔄 Proxy Rotation**: Built-in support for dynamic proxy switching and IP verification, with support for authenticated proxies and session persistence. 
- **➡️ URL Redirection Tracking**: The `final_url` field now captures the final destination after any redirects. - **🪞 Improved Mirroring**: The `LXMLWebScrapingStrategy` now has much greater fidelity, allowing for almost pixel-perfect mirroring of websites. - **📈 Enhanced Monitoring**: Track memory, CPU, and individual crawler status with `CrawlerMonitor`. diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 5d2b86af..a0acc761 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.4.3b1" +__version__ = "0.4.3b2" diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index ae1788f1..a2bb7b96 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -633,9 +633,12 @@ class BrowserManager: if crawlerRunConfig.proxy_config: proxy_settings = { "server": crawlerRunConfig.proxy_config.get("server"), - "username": crawlerRunConfig.proxy_config.get("username"), - "password": crawlerRunConfig.proxy_config.get("password"), } + if crawlerRunConfig.proxy_config.get("username"): + proxy_settings.update({ + "username": crawlerRunConfig.proxy_config.get("username"), + "password": crawlerRunConfig.proxy_config.get("password"), + }) context_settings["proxy"] = proxy_settings if self.config.text_mode: diff --git a/docs/examples/v0_4_3_features_demo.py b/docs/examples/v0_4_3_features_demo.py index 2ffaa172..033bf30f 100644 --- a/docs/examples/v0_4_3_features_demo.py +++ b/docs/examples/v0_4_3_features_demo.py @@ -233,6 +233,64 @@ async def demo_llm_schema_generation(): print("Successfully used generated schema for crawling") +async def get_next_proxy(proxy_file: str = f"proxies.txt") -> Optional[Dict]: + """Get next proxy from local file""" + try: + with open(proxy_file) as f: + proxies = f.read().splitlines() + if not proxies: + return None + + ip, port, username, password = random.choice(proxies).split(":") + return { + "server": 
f"http://{ip}:{port}", + "username": username, + "password": password, + "ip": ip # Store original IP for verification + } + except Exception as e: + print(f"Error loading proxy: {e}") + return None + +async def demo_proxy_rotation(): + """ + 8. Proxy Rotation Demo + =================== + Demonstrates how to rotate proxies for each request using Crawl4ai. + """ + print("\n=== 8. Proxy Rotation Demo ===") + + + # Create 10 test requests to httpbin + urls = ["https://httpbin.org/ip"] * 3 + + browser_config = BrowserConfig(headless=True) + run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + + async with AsyncWebCrawler(config=browser_config) as crawler: + for url in urls: + proxy = await get_next_proxy() + if not proxy: + print("No proxy available, skipping...") + continue + + # Create new config with proxy + current_config = run_config.clone(proxy_config=proxy) + result = await crawler.arun(url=url, config=current_config) + + if result.success: + ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html) + print(f"Proxy {proxy['ip']} -> Response IP: {ip_match.group(0) if ip_match else 'Not found'}") + verified = ip_match.group(0) == proxy['ip'] + if verified: + print(f"✅ Proxy working! 
IP matches: {proxy['ip']}") + else: + print(f"❌ Proxy failed or IP mismatch!") + else: + print(f"Failed with proxy {proxy['ip']}") + +if __name__ == "__main__": + async def main(): """Run all feature demonstrations.""" demo_memory_dispatcher(), @@ -247,6 +305,8 @@ async def main(): print("\n" + "=" * 50 + "\n") demo_robots_compliance(), print("\n" + "=" * 50 + "\n") + demo_proxy_rotation() + print("\n" + "=" * 50 + "\n") if __name__ == "__main__": asyncio.run(main()) diff --git a/docs/md_v2/advanced/proxy-security.md b/docs/md_v2/advanced/proxy-security.md index 9b64fd84..0e56572c 100644 --- a/docs/md_v2/advanced/proxy-security.md +++ b/docs/md_v2/advanced/proxy-security.md @@ -38,7 +38,7 @@ async with AsyncWebCrawler(config=browser_config) as crawler: Here's the corrected documentation: -## Rotating Proxies [COMING SOON] +## Rotating Proxies Example using a proxy rotation service dynamically: From 2d69bf2366cf5c2524958d306ca942c3131a4cb7 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 22 Jan 2025 17:14:24 +0800 Subject: [PATCH 06/19] refactor(models): rename final_url to redirected_url for consistency Renames the final_url field to redirected_url across all components to maintain consistent terminology throughout the codebase. This change affects: - AsyncCrawlResponse model - AsyncPlaywrightCrawlerStrategy - Documentation and examples No functional changes, purely naming consistency improvement. 
--- CHANGELOG.md | 2 +- README.md | 2 +- crawl4ai/async_crawler_strategy.py | 6 +- crawl4ai/async_webcrawler.py | 2 +- crawl4ai/models.py | 2 +- docs/examples/v0_4_3_features_demo.py | 192 +++++++++------ docs/md_v2/blog/releases/v0.4.3b1.md | 334 ++++++++------------------ 7 files changed, 226 insertions(+), 314 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d62d8775..a9d363c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,7 +29,7 @@ This release introduces several powerful new features, including robots.txt comp - **URL Redirection Tracking:** - Added URL redirection tracking to capture the final URL after any redirects. - - The final URL is now available in the `final_url` field of the `AsyncCrawlResponse` object. + - The final URL is now available in the `redirected_url` field of the `AsyncCrawlResponse` object. - **Enhanced Streamlined Documentation:** - Refactored and improved the documentation structure for clarity and ease of use. diff --git a/README.md b/README.md index 9cfe4512..1bcaf910 100644 --- a/README.md +++ b/README.md @@ -492,7 +492,7 @@ async def test_news_crawl(): - **🏎️ Faster Scraping Option**: New `LXMLWebScrapingStrategy` offers **10-20x speedup** for large, complex pages (experimental). - **🤖 robots.txt Compliance**: Respect website rules with `check_robots_txt=True` and efficient local caching. - **🔄 Proxy Rotation**: Built-in support for dynamic proxy switching and IP verification, with support for authenticated proxies and session persistence. -- **➡️ URL Redirection Tracking**: The `final_url` field now captures the final destination after any redirects. +- **➡️ URL Redirection Tracking**: The `redirected_url` field now captures the final destination after any redirects. - **🪞 Improved Mirroring**: The `LXMLWebScrapingStrategy` now has much greater fidelity, allowing for almost pixel-perfect mirroring of websites. - **📈 Enhanced Monitoring**: Track memory, CPU, and individual crawler status with `CrawlerMonitor`. 
- **📝 Improved Documentation**: More examples, clearer explanations, and updated tutorials. diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index a2bb7b96..738dfb51 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1254,7 +1254,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): config.url = url response_headers = {} status_code = None - final_url = url + redirected_url = url # Reset downloaded files list for new crawl self._downloaded_files = [] @@ -1336,7 +1336,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): response = await page.goto( url, wait_until=config.wait_until, timeout=config.page_timeout ) - final_url = page.url + redirected_url = page.url except Error as e: raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}") @@ -1616,7 +1616,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): downloaded_files=( self._downloaded_files if self._downloaded_files else None ), - final_url=final_url, + redirected_url=redirected_url, ) except Exception as e: diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index dc7e2cb9..617b6901 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -462,7 +462,7 @@ class AsyncWebCrawler: ) crawl_result.status_code = async_response.status_code - crawl_result.redirected_url = async_response.final_url or url + crawl_result.redirected_url = async_response.redirected_url or url crawl_result.response_headers = async_response.response_headers crawl_result.downloaded_files = async_response.downloaded_files crawl_result.ssl_certificate = ( diff --git a/crawl4ai/models.py b/crawl4ai/models.py index 81e08b0c..57edacd7 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -132,7 +132,7 @@ class AsyncCrawlResponse(BaseModel): get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None downloaded_files: Optional[List[str]] = None 
ssl_certificate: Optional[SSLCertificate] = None - final_url: Optional[str] = None + redirected_url: Optional[str] = None class Config: arbitrary_types_allowed = True diff --git a/docs/examples/v0_4_3_features_demo.py b/docs/examples/v0_4_3_features_demo.py index 033bf30f..9406b50d 100644 --- a/docs/examples/v0_4_3_features_demo.py +++ b/docs/examples/v0_4_3_features_demo.py @@ -2,54 +2,96 @@ Crawl4ai v0.4.3 Features Demo ============================ -This example demonstrates the major new features introduced in Crawl4ai v0.4.3. -Each section showcases a specific feature with practical examples and explanations. +This demonstration showcases three major categories of new features in Crawl4ai v0.4.3: + +1. Efficiency & Speed: + - Memory-efficient dispatcher strategies + - New scraping algorithm + - Streaming support for batch crawling + +2. LLM Integration: + - Automatic schema generation + - LLM-powered content filtering + - Smart markdown generation + +3. Core Improvements: + - Robots.txt compliance + - Proxy rotation + - Enhanced URL handling + +Each demo function can be run independently or as part of the full suite. """ import asyncio import os -from crawl4ai import * +import json +import re +import random +from typing import Optional, Dict +from dotenv import load_dotenv + +load_dotenv() + +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + CacheMode, + DisplayMode, + MemoryAdaptiveDispatcher, + CrawlerMonitor, + DefaultMarkdownGenerator, + LXMLWebScrapingStrategy, + JsonCssExtractionStrategy, + LLMContentFilter +) async def demo_memory_dispatcher(): + """Demonstrates the new memory-efficient dispatcher system. + + Key Features: + - Adaptive memory management + - Real-time performance monitoring + - Concurrent session control """ - 1. Memory Dispatcher System Demo - =============================== - Shows how to use the new memory dispatcher with monitoring - """ - print("\n=== 1. 
Memory Dispatcher System Demo ===") - - # Configure crawler - browser_config = BrowserConfig(headless=True, verbose=True) - crawler_config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, markdown_generator=DefaultMarkdownGenerator() - ) - - # Test URLs - urls = ["http://example.com", "http://example.org", "http://example.net"] * 3 - - async with AsyncWebCrawler(config=browser_config) as crawler: - # Initialize dispatcher with monitoring - monitor = CrawlerMonitor( - max_visible_rows=10, - display_mode=DisplayMode.DETAILED, # Can be DETAILED or AGGREGATED + print("\n=== Memory Dispatcher Demo ===") + + try: + # Configuration + browser_config = BrowserConfig(headless=True, verbose=False) + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + markdown_generator=DefaultMarkdownGenerator() ) - dispatcher = MemoryAdaptiveDispatcher( - memory_threshold_percent=80.0, # Memory usage threshold - check_interval=0.5, # How often to check memory - max_session_permit=5, # Max concurrent crawls - monitor=monitor, # Pass the monitor - ) + # Test URLs + urls = ["http://example.com", "http://example.org", "http://example.net"] * 3 - # Run with memory monitoring - print("Starting batch crawl with memory monitoring...") - results = await dispatcher.run_urls( - urls=urls, - crawler=crawler, - config=crawler_config, - ) - print(f"Completed {len(results)} URLs") + print("\n📈 Initializing crawler with memory monitoring...") + async with AsyncWebCrawler(config=browser_config) as crawler: + monitor = CrawlerMonitor( + max_visible_rows=10, + display_mode=DisplayMode.DETAILED + ) + + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=80.0, + check_interval=0.5, + max_session_permit=5, + monitor=monitor + ) + + print("\n🚀 Starting batch crawl...") + results = await dispatcher.run_urls( + urls=urls, + crawler=crawler, + config=crawler_config, + ) + print(f"\n✅ Completed {len(results)} URLs successfully") + + except Exception as e: + print(f"\n❌ Error in memory 
dispatcher demo: {str(e)}") async def demo_streaming_support(): @@ -60,7 +102,7 @@ async def demo_streaming_support(): """ print("\n=== 2. Streaming Support Demo ===") - browser_config = BrowserConfig(headless=True, verbose=True) + browser_config = BrowserConfig(headless=True, verbose=False) crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=True) # Test URLs @@ -179,7 +221,7 @@ async def demo_robots_compliance(): -async def demo_llm_schema_generation(): +async def demo_json_schema_generation(): """ 7. LLM-Powered Schema Generation Demo ================================= @@ -233,25 +275,6 @@ async def demo_llm_schema_generation(): print("Successfully used generated schema for crawling") -async def get_next_proxy(proxy_file: str = f"proxies.txt") -> Optional[Dict]: - """Get next proxy from local file""" - try: - with open(proxy_file) as f: - proxies = f.read().splitlines() - if not proxies: - return None - - ip, port, username, password = random.choice(proxies).split(":") - return { - "server": f"http://{ip}:{port}", - "username": username, - "password": password, - "ip": ip # Store original IP for verification - } - except Exception as e: - print(f"Error loading proxy: {e}") - return None - async def demo_proxy_rotation(): """ 8. Proxy Rotation Demo @@ -259,12 +282,28 @@ async def demo_proxy_rotation(): Demonstrates how to rotate proxies for each request using Crawl4ai. """ print("\n=== 8. 
Proxy Rotation Demo ===") + + async def get_next_proxy(proxy_file: str = f"proxies.txt") -> Optional[Dict]: + """Get next proxy from local file""" + try: + proxies = os.getenv("PROXIES", "").split(",") + + ip, port, username, password = random.choice(proxies).split(":") + return { + "server": f"http://{ip}:{port}", + "username": username, + "password": password, + "ip": ip # Store original IP for verification + } + except Exception as e: + print(f"Error loading proxy: {e}") + return None # Create 10 test requests to httpbin - urls = ["https://httpbin.org/ip"] * 3 + urls = ["https://httpbin.org/ip"] * 2 - browser_config = BrowserConfig(headless=True) + browser_config = BrowserConfig(headless=True, verbose=False) run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) async with AsyncWebCrawler(config=browser_config) as crawler: @@ -289,24 +328,25 @@ async def demo_proxy_rotation(): else: print(f"Failed with proxy {proxy['ip']}") -if __name__ == "__main__": - async def main(): """Run all feature demonstrations.""" - demo_memory_dispatcher(), - print("\n" + "=" * 50 + "\n") - demo_streaming_support(), - print("\n" + "=" * 50 + "\n") - demo_content_scraping(), - print("\n" + "=" * 50 + "\n") - demo_llm_schema_generation(), - print("\n" + "=" * 50 + "\n") - demo_llm_markdown(), - print("\n" + "=" * 50 + "\n") - demo_robots_compliance(), - print("\n" + "=" * 50 + "\n") - demo_proxy_rotation() - print("\n" + "=" * 50 + "\n") + print("\n📊 Running Crawl4ai v0.4.3 Feature Demos\n") + + # Efficiency & Speed Demos + # print("\n🚀 EFFICIENCY & SPEED DEMOS") + # await demo_memory_dispatcher() + # await demo_streaming_support() + # await demo_content_scraping() + + # # LLM Integration Demos + # print("\n🤖 LLM INTEGRATION DEMOS") + # await demo_json_schema_generation() + # await demo_llm_markdown() + + # # Core Improvements + # print("\n🔧 CORE IMPROVEMENT DEMOS") + # await demo_robots_compliance() + await demo_proxy_rotation() if __name__ == "__main__": asyncio.run(main()) diff 
--git a/docs/md_v2/blog/releases/v0.4.3b1.md b/docs/md_v2/blog/releases/v0.4.3b1.md index f648b462..9b027dd6 100644 --- a/docs/md_v2/blog/releases/v0.4.3b1.md +++ b/docs/md_v2/blog/releases/v0.4.3b1.md @@ -1,266 +1,138 @@ -# Crawl4AI 0.4.3b1 is Here: Faster, Smarter, and Ready for Real-World Crawling! +# Crawl4AI 0.4.3: Major Performance Boost & LLM Integration -Hey, Crawl4AI enthusiasts! We're thrilled to announce the release of **Crawl4AI 0.4.3b1**, packed with powerful new features and enhancements that take web crawling to a whole new level of efficiency and intelligence. This release is all about giving you more control, better performance, and deeper insights into your crawled data. +We're excited to announce Crawl4AI 0.4.3, focusing on three key areas: Speed & Efficiency, LLM Integration, and Core Platform Improvements. This release significantly improves crawling performance while adding powerful new LLM-powered features. -Let's dive into what's new! +## ⚡ Speed & Efficiency Improvements -## 🚀 Major Feature Highlights - -### 1. LLM-Powered Schema Generation: Zero to Structured Data in Seconds! - -Tired of manually crafting CSS or XPath selectors? We've got you covered! Crawl4AI now features a revolutionary **schema generator** that uses the power of Large Language Models (LLMs) to automatically create extraction schemas for you. - -**How it Works:** - -1. **Provide HTML**: Feed in a sample HTML snippet that contains the type of data you want to extract (e.g., product listings, article sections). -2. **Describe Your Needs (Optional)**: You can provide a natural language query like "extract all product names and prices" to guide the schema creation. -3. **Choose Your LLM**: Use either **OpenAI** (GPT-4o recommended) for top-tier accuracy or **Ollama** for a local, open-source option. -4. **Get Your Schema**: The tool outputs a ready-to-use JSON schema that works seamlessly with `JsonCssExtractionStrategy` or `JsonXPathExtractionStrategy`. 
- -**Why You'll Love It:** - -- **No More Tedious Selector Writing**: Let the LLM analyze the HTML and create the selectors for you! -- **One-Time Cost**: Schema generation uses LLM, but once you have your schema, subsequent extractions are fast and LLM-free. -- **Handles Complex Structures**: The LLM can understand nested elements, lists, and variations in layout—far beyond what simple CSS selectors can achieve. -- **Learn by Example**: The generated schemas are a fantastic way to learn best practices for writing your own schemas. - -**Example:** +### 1. Memory-Adaptive Dispatcher System +The new dispatcher system provides intelligent resource management and real-time monitoring: ```python -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy - -# Sample HTML snippet (imagine this is part of a product listing page) -html = """ -
-

Awesome Gadget

- $99.99 -
-""" - -# Generate schema using OpenAI -schema = JsonCssExtractionStrategy.generate_schema( - html, - llm_provider="openai/gpt-4o", - api_token="YOUR_API_TOKEN" -) - -# Or use Ollama for a local, open-source option -# schema = JsonCssExtractionStrategy.generate_schema( -# html, -# llm_provider="ollama/llama3" -# ) - -print(json.dumps(schema, indent=2)) -``` - -**Output (Schema):** - -```json -{ - "name": null, - "baseSelector": "div.product", - "fields": [ - { - "name": "name", - "selector": "h2.name", - "type": "text" - }, - { - "name": "price", - "selector": "span.price", - "type": "text" - } - ] -} -``` - -You can now **save** this schema and use it for all your extractions on pages with the same structure. No more LLM costs, just **fast, reliable** data extraction! - -### 2. Robots.txt Compliance: Crawl Responsibly - -Crawl4AI now respects website rules! With the new `check_robots_txt=True` option in `CrawlerRunConfig`, the crawler automatically fetches, parses, and obeys each site's `robots.txt` file. - -**Key Features**: - -- **Efficient Caching**: Stores parsed `robots.txt` files locally for 7 days to avoid re-fetching. -- **Automatic Integration**: Works seamlessly with both `arun()` and `arun_many()`. -- **Clear Status Codes**: Returns a 403 status code if a URL is disallowed. -- **Customizable**: Adjust the cache directory and TTL if needed. - -**Example**: - -```python -from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DisplayMode +from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher, CrawlerMonitor async def main(): - config = CrawlerRunConfig( - cache_mode=CacheMode.ENABLED, - check_robots_txt=True - ) - - async with AsyncWebCrawler() as crawler: - result = await crawler.arun("https://example.com/private-page", config=config) - if result.status_code == 403: - print("Access denied by robots.txt") - -if __name__ == "__main__": - asyncio.run(main()) -``` - -### 3. 
Proxy Support in `CrawlerRunConfig` - -Need more control over your proxy settings? Now you can configure proxies directly within `CrawlerRunConfig` for each crawl: - -```python -from crawl4ai import AsyncWebCrawler, CrawlerRunConfig - -async def main(): - config = CrawlerRunConfig( - proxy_config={ - "server": "http://your-proxy.com:8080", - "username": "your_username", # Optional - "password": "your_password" # Optional - } - ) - - async with AsyncWebCrawler() as crawler: - result = await crawler.arun("https://example.com", config=config) -``` - -This allows for dynamic proxy assignment per URL or even per request. - -### 4. LLM-Powered Markdown Filtering (Beta) - -We're introducing an experimental **`LLMContentFilter`**! This filter, when used with the `DefaultMarkdownGenerator`, can produce highly focused markdown output by using an LLM to analyze content relevance. - -**How it Works:** - -1. You provide an **instruction** (e.g., "extract only the key technical details"). -2. The LLM analyzes each section of the page based on your instruction. -3. Only the most relevant content is included in the final `fit_markdown`. - -**Example**: - -```python -from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.content_filter_strategy import LLMContentFilter -from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator - -async def main(): - llm_filter = LLMContentFilter( - provider="openai/gpt-4o", - api_token="YOUR_API_TOKEN", # Or use "ollama/llama3" with no token - instruction="Extract the core educational content about Python classes." 
- ) - - config = CrawlerRunConfig( - markdown_generator=DefaultMarkdownGenerator(content_filter=llm_filter) - ) - - async with AsyncWebCrawler() as crawler: - result = await crawler.arun( - "https://docs.python.org/3/tutorial/classes.html", - config=config + urls = ["https://example1.com", "https://example2.com"] * 50 + + # Configure memory-aware dispatch + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=80.0, # Auto-throttle at 80% memory + check_interval=0.5, # Check every 0.5 seconds + max_session_permit=20, # Max concurrent sessions + monitor=CrawlerMonitor( # Real-time monitoring + display_mode=DisplayMode.DETAILED + ) + ) + + async with AsyncWebCrawler() as crawler: + results = await dispatcher.run_urls( + urls=urls, + crawler=crawler, + config=CrawlerRunConfig() ) - print(result.markdown_v2.fit_markdown) - -if __name__ == "__main__": - asyncio.run(main()) ``` -**Note**: This is a beta feature. We're actively working on improving its accuracy and performance. - -### 5. Streamlined `arun_many()` with Dispatchers - -We've simplified concurrent crawling! `arun_many()` now intelligently handles multiple URLs, either returning a **list** of results or an **async generator** for streaming. - -**Basic Usage (Batch)**: +### 2. Streaming Support +Process crawled URLs in real-time instead of waiting for all results: ```python -results = await crawler.arun_many( - urls=["https://site1.com", "https://site2.com"], - config=CrawlerRunConfig() -) +config = CrawlerRunConfig(stream=True) -for res in results: - print(res.url, "crawled successfully:", res.success) +async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun_many(urls, config=config): + print(f"Got result for {result.url}") + # Process each result immediately ``` -**Streaming Mode**: +### 3. 
LXML-Based Scraping +New LXML scraping strategy offering up to 20x faster parsing: ```python -async for result in await crawler.arun_many( - urls=["https://site1.com", "https://site2.com"], - config=CrawlerRunConfig(stream=True) -): - print("Just finished:", result.url) - # Process each result immediately -``` - -**Advanced:** You can now customize how `arun_many` handles concurrency by passing a **dispatcher**. See [Advanced Multi-URL Crawling](../advanced/multi-url-crawling.md) for details. - -### 6. Enhanced Browser Context Management - -We've improved how Crawl4AI manages browser contexts for better resource utilization and session handling. - -- **`shared_data` in `CrawlerRunConfig`**: Pass data between hooks using the `shared_data` dictionary. -- **Context Reuse**: The crawler now intelligently reuses browser contexts based on configuration, reducing overhead. - -### 7. Faster Scraping with `LXMLWebScrapingStrategy` - -Introducing a new, optional **`LXMLWebScrapingStrategy`** that can be **10-20x faster** than the default BeautifulSoup approach for large, complex pages. - -**How to Use**: - -```python -from crawl4ai import LXMLWebScrapingStrategy - config = CrawlerRunConfig( - scraping_strategy=LXMLWebScrapingStrategy() # Add this line + scraping_strategy=LXMLWebScrapingStrategy(), + cache_mode=CacheMode.ENABLED ) ``` -**When to Use**: -- If profiling shows a bottleneck in `WebScrapingStrategy`. -- For very large HTML documents where parsing speed matters. +## 🤖 LLM Integration -**Caveats**: -- It might not handle malformed HTML as gracefully as BeautifulSoup. -- We're still gathering data, so report any issues! +### 1. 
LLM-Powered Markdown Generation +Smart content filtering and organization using LLMs: ---- +```python +config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + content_filter=LLMContentFilter( + provider="openai/gpt-4o", + instruction="Extract technical documentation and code examples" + ) + ) +) +``` -## Try the Feature Demo Script! +### 2. Automatic Schema Generation +Generate extraction schemas instantly using LLMs instead of manual CSS/XPath writing: -We've prepared a Python script demonstrating these new features. You can find it at: +```python +schema = JsonCssExtractionStrategy.generate_schema( + html_content, + schema_type="CSS", + query="Extract product name, price, and description" +) +``` -[**`features_demo.py`**](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/0_4_3b1_feature_demo.py) +## 🔧 Core Improvements -**To run the demo:** +### 1. Proxy Support & Rotation +Integrated proxy support with automatic rotation and verification: -1. Make sure you have Crawl4AI installed (`pip install crawl4ai`). -2. Copy the `features_demo.py` script to your local environment. -3. Set your OpenAI API key as an environment variable (if using OpenAI models): - ```bash - export OPENAI_API_KEY="your_api_key" - ``` -4. Run the script: - ```bash - python features_demo.py - ``` +```python +config = CrawlerRunConfig( + proxy_config={ + "server": "http://proxy:8080", + "username": "user", + "password": "pass" + } +) +``` -The script will execute various crawl scenarios, showcasing the new features and printing results to your console. +### 2. Robots.txt Compliance +Built-in robots.txt support with SQLite caching: -## Conclusion +```python +config = CrawlerRunConfig(check_robots_txt=True) +result = await crawler.arun(url, config=config) +if result.status_code == 403: + print("Access blocked by robots.txt") +``` -Crawl4AI version 0.4.3b1 is a major step forward in flexibility, performance, and ease of use. 
With automatic schema generation, robots.txt handling, advanced content filtering, and streamlined multi-URL crawling, you can build powerful, efficient, and responsible web scrapers. +### 3. URL Redirection Tracking +Track final URLs after redirects: -We encourage you to try out these new capabilities, explore the updated documentation, and share your feedback! Your input is invaluable as we continue to improve Crawl4AI. +```python +result = await crawler.arun(url) +print(f"Initial URL: {url}") +print(f"Final URL: {result.redirected_url}") +``` -**Stay Connected:** +## Performance Impact -- **Star** us on [GitHub](https://github.com/unclecode/crawl4ai) to show your support! -- **Follow** [@unclecode](https://twitter.com/unclecode) on Twitter for updates and tips. -- **Join** our community on Discord (link coming soon) to discuss your projects and get help. +- Memory usage reduced by up to 40% with adaptive dispatcher +- Parsing speed increased up to 20x with LXML strategy +- Streaming reduces memory footprint for large crawls by ~60% -Happy crawling! +## Getting Started + +```bash +pip install -U crawl4ai +``` + +For complete examples, check our [demo repository](https://github.com/unclecode/crawl4ai/examples). + +## Stay Connected + +- Star us on [GitHub](https://github.com/unclecode/crawl4ai) +- Follow [@unclecode](https://twitter.com/unclecode) +- Join our [Discord](https://discord.gg/crawl4ai) + +Happy crawling! 
🕷️ \ No newline at end of file From 976ea5216767cbd09a9da79588ae2b6412c2a87c Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 22 Jan 2025 20:40:03 +0800 Subject: [PATCH 07/19] docs(examples): update demo scripts and fix output formats Update example scripts to reflect latest API changes and improve demonstrations: - Increase test URLs in dispatcher example from 20 to 40 pages - Comment out unused dispatcher strategies for cleaner output - Fix scraping strategies performance script to use correct object notation - Update v0_4_3_features_demo with additional feature mentions and uncomment demo sections These changes make the examples more current and better aligned with the actual API. --- docs/examples/dispatcher_example.py | 16 +++++++-------- .../scraping_strategies_performance.py | 14 ++++++------- docs/examples/v0_4_3_features_demo.py | 20 ++++++++++--------- 3 files changed, 26 insertions(+), 24 deletions(-) diff --git a/docs/examples/dispatcher_example.py b/docs/examples/dispatcher_example.py index ae6406bb..cac08186 100644 --- a/docs/examples/dispatcher_example.py +++ b/docs/examples/dispatcher_example.py @@ -112,19 +112,19 @@ def create_performance_table(results): async def main(): - urls = [f"https://example.com/page{i}" for i in range(1, 20)] + urls = [f"https://example.com/page{i}" for i in range(1, 40)] browser_config = BrowserConfig(headless=True, verbose=False) run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, scraping_strategy=LXMLWebScrapingStrategy()) results = { "Memory Adaptive": await memory_adaptive(urls, browser_config, run_config), - "Memory Adaptive + Rate Limit": await memory_adaptive_with_rate_limit( - urls, browser_config, run_config - ), - "Semaphore": await semaphore(urls, browser_config, run_config), - "Semaphore + Rate Limit": await semaphore_with_rate_limit( - urls, browser_config, run_config - ), + # "Memory Adaptive + Rate Limit": await memory_adaptive_with_rate_limit( + # urls, browser_config, run_config + # ), + # 
"Semaphore": await semaphore(urls, browser_config, run_config), + # "Semaphore + Rate Limit": await semaphore_with_rate_limit( + # urls, browser_config, run_config + # ), } table = create_performance_table(results) diff --git a/docs/examples/scraping_strategies_performance.py b/docs/examples/scraping_strategies_performance.py index b8c80be2..87fb8ac5 100644 --- a/docs/examples/scraping_strategies_performance.py +++ b/docs/examples/scraping_strategies_performance.py @@ -117,17 +117,17 @@ def test_scraping(): timing_stats.report() # Print stats of LXML output - print("\nLXML Output:") - print(f"\nExtracted links: {len(result_selected['links']['internal']) + len(result_selected['links']['external'])}") - print(f"Extracted images: {len(result_selected['media']['images'])}") - print(f"Clean HTML size: {len(result_selected['cleaned_html'])/1024:.2f} KB") + print("\Turbo Output:") + print(f"\nExtracted links: {len(result_selected.links.internal) + len(result_selected.links.external)}") + print(f"Extracted images: {len(result_selected.media.images)}") + print(f"Clean HTML size: {len(result_selected.cleaned_html)/1024:.2f} KB") print(f"Scraping time: {t2 - t1:.2f} seconds") # Print stats of original output print("\nOriginal Output:") - print(f"\nExtracted links: {len(result_original['links']['internal']) + len(result_original['links']['external'])}") - print(f"Extracted images: {len(result_original['media']['images'])}") - print(f"Clean HTML size: {len(result_original['cleaned_html'])/1024:.2f} KB") + print(f"\nExtracted links: {len(result_original.links.internal) + len(result_original.links.external)}") + print(f"Extracted images: {len(result_original.media.images)}") + print(f"Clean HTML size: {len(result_original.cleaned_html)/1024:.2f} KB") print(f"Scraping time: {t3 - t1:.2f} seconds") diff --git a/docs/examples/v0_4_3_features_demo.py b/docs/examples/v0_4_3_features_demo.py index 9406b50d..6e7a8a0a 100644 --- a/docs/examples/v0_4_3_features_demo.py +++ 
b/docs/examples/v0_4_3_features_demo.py @@ -18,6 +18,8 @@ This demonstration showcases three major categories of new features in Crawl4ai - Robots.txt compliance - Proxy rotation - Enhanced URL handling + - Shared data among hooks + - add page routes Each demo function can be run independently or as part of the full suite. """ @@ -333,19 +335,19 @@ async def main(): print("\n📊 Running Crawl4ai v0.4.3 Feature Demos\n") # Efficiency & Speed Demos - # print("\n🚀 EFFICIENCY & SPEED DEMOS") - # await demo_memory_dispatcher() - # await demo_streaming_support() - # await demo_content_scraping() + print("\n🚀 EFFICIENCY & SPEED DEMOS") + await demo_memory_dispatcher() + await demo_streaming_support() + await demo_content_scraping() # # LLM Integration Demos - # print("\n🤖 LLM INTEGRATION DEMOS") - # await demo_json_schema_generation() - # await demo_llm_markdown() + print("\n🤖 LLM INTEGRATION DEMOS") + await demo_json_schema_generation() + await demo_llm_markdown() # # Core Improvements - # print("\n🔧 CORE IMPROVEMENT DEMOS") - # await demo_robots_compliance() + print("\n🔧 CORE IMPROVEMENT DEMOS") + await demo_robots_compliance() await demo_proxy_rotation() if __name__ == "__main__": From 260b9120c388de5eac8fe1d3cf32bce24977ed41 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 22 Jan 2025 20:41:43 +0800 Subject: [PATCH 08/19] docs(examples): update v0.4.3 features demo to v0.4.3b2 Rename and replace the features demo file to reflect the beta 2 version number. The old v0.4.3 demo file is removed and replaced with a new beta 2 version. 
Renames: - docs/examples/v0_4_3_features_demo.py -> docs/examples/v0_4_3b2_features_demo.py --- .../{v0_4_3_features_demo.py => v0_4_3b2_features_demo.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename docs/examples/{v0_4_3_features_demo.py => v0_4_3b2_features_demo.py} (99%) diff --git a/docs/examples/v0_4_3_features_demo.py b/docs/examples/v0_4_3b2_features_demo.py similarity index 99% rename from docs/examples/v0_4_3_features_demo.py rename to docs/examples/v0_4_3b2_features_demo.py index 6e7a8a0a..6e091423 100644 --- a/docs/examples/v0_4_3_features_demo.py +++ b/docs/examples/v0_4_3b2_features_demo.py @@ -1,5 +1,5 @@ """ -Crawl4ai v0.4.3 Features Demo +Crawl4ai v0.4.3b2 Features Demo ============================ This demonstration showcases three major categories of new features in Crawl4ai v0.4.3: From 357414c345e527b5145b1e6f8189017e2ab7ad0a Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 22 Jan 2025 20:46:39 +0800 Subject: [PATCH 09/19] docs(readme): update version references and fix links Update version numbers to v0.4.3bx throughout README.md Fix contributing guidelines link to point to CONTRIBUTORS.md Update Aravind's role in CONTRIBUTORS.md to Head of Community and Product Add pre-release installation instructions Fix minor formatting in personal story section No breaking changes --- CONTRIBUTORS.md | 2 +- README.md | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 79038bdd..82f677cd 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -6,7 +6,7 @@ We would like to thank the following people for their contributions to Crawl4AI: - [Unclecode](https://github.com/unclecode) - Project Creator and Main Developer - [Nasrin](https://github.com/ntohidi) - Project Manager and Developer -- [Aravind Karnam](https://github.com/aravindkarnam) - Developer +- [Aravind Karnam](https://github.com/aravindkarnam) - Head of Community and Product ## Community Contributors diff --git 
a/README.md b/README.md index 1bcaf910..4fca126b 100644 --- a/README.md +++ b/README.md @@ -20,9 +20,9 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease. -[✨ Check out latest update v0.4.3b1x](#-recent-updates) +[✨ Check out latest update v0.4.3bx](#-recent-updates) -🎉 **Version 0.4.3b1 is out!** This release brings exciting new features like a Memory Dispatcher System, Streaming Support, LLM-Powered Markdown Generation, Schema Generation, and Robots.txt Compliance! [Read the release notes →](https://docs.crawl4ai.com/blog) +🎉 **Version 0.4.3bx is out!** This release brings exciting new features like a Memory Dispatcher System, Streaming Support, LLM-Powered Markdown Generation, Schema Generation, and Robots.txt Compliance! [Read the release notes →](https://docs.crawl4ai.com/blog)
🤓 My Personal Story @@ -31,7 +31,7 @@ My journey with computers started in childhood when my dad, a computer scientist Fast forward to 2023, I was working on a tool for a project and needed a crawler to convert a webpage into markdown. While exploring solutions, I found one that claimed to be open-source but required creating an account and generating an API token. Worse, it turned out to be a SaaS model charging $16, and its quality didn’t meet my standards. Frustrated, I realized this was a deeper problem. That frustration turned into turbo anger mode, and I decided to build my own solution. In just a few days, I created Crawl4AI. To my surprise, it went viral, earning thousands of GitHub stars and resonating with a global community. -I made Crawl4AI open-source for two reasons. First, it’s my way of giving back to the open-source community that has supported me throughout my career. Second, I believe data should be accessible to everyone, not locked behind paywalls or monopolized by a few. Open access to data lays the foundation for the democratization of AI—a vision where individuals can train their own models and take ownership of their information. This library is the first step in a larger journey to create the best open-source data extraction and generation tool the world has ever seen, built collaboratively by a passionate community. +I made Crawl4AI open-source for two reasons. First, it’s my way of giving back to the open-source community that has supported me throughout my career. Second, I believe data should be accessible to everyone, not locked behind paywalls or monopolized by a few. Open access to data lays the foundation for the democratization of AI, a vision where individuals can train their own models and take ownership of their information. This library is the first step in a larger journey to create the best open-source data extraction and generation tool the world has ever seen, built collaboratively by a passionate community. 
Thank you to everyone who has supported this project, used it, and shared feedback. Your encouragement motivates me to dream even bigger. Join us, file issues, submit PRs, or spread the word. Together, we can build a tool that truly empowers people to access their own data and reshape the future of AI.
@@ -52,6 +52,9 @@ Thank you to everyone who has supported this project, used it, and shared feedba # Install the package pip install -U crawl4ai +# For pre release versions +pip install crawl4ai --pre + # Run post-installation setup crawl4ai-setup @@ -497,10 +500,7 @@ async def test_news_crawl(): - **📈 Enhanced Monitoring**: Track memory, CPU, and individual crawler status with `CrawlerMonitor`. - **📝 Improved Documentation**: More examples, clearer explanations, and updated tutorials. -Read the full details in our [0.4.248 Release Notes](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md). - -Here's a clear markdown explanation for your users about version numbering: - +Read the full details in our [0.4.3bx Release Notes](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md). ## Version Numbering in Crawl4AI @@ -571,7 +571,7 @@ To check our development plans and upcoming features, visit our [Roadmap](https: ## 🤝 Contributing -We welcome contributions from the open-source community. Check out our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTING.md) for more information. +We welcome contributions from the open-source community. Check out our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTORS.md) for more information. ## 📄 License From 7b7fe84e0d47fdeeb99e11aa91b5664c5e1c2447 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 22 Jan 2025 20:52:42 +0800 Subject: [PATCH 10/19] docs(readme): resolve merge conflict and update version info Resolves merge conflict in README.md by removing outdated version 0.4.24x information and keeping current version 0.4.3bx details. Updates release notes description to reflect current features including Memory Dispatcher System, Streaming Support, and other improvements. No breaking changes. 
--- README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/README.md b/README.md index 5cb60452..8987d19d 100644 --- a/README.md +++ b/README.md @@ -23,9 +23,6 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant [✨ Check out latest update v0.4.3bx](#-recent-updates) -<<<<<<< HEAD -🎉 **Version 0.4.24x is out!** Major improvements in extraction strategies with enhanced JSON handling, SSL security, and Amazon product extraction. Plus, a completely revamped content filtering system! [Read the release notes →](https://docs.crawl4ai.com/blog) -======= 🎉 **Version 0.4.3bx is out!** This release brings exciting new features like a Memory Dispatcher System, Streaming Support, LLM-Powered Markdown Generation, Schema Generation, and Robots.txt Compliance! [Read the release notes →](https://docs.crawl4ai.com/blog)
From 6dc01eae3ac77092d6fe3e9f6730cb6afb1ae8d2 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 23 Jan 2025 18:53:22 +0800 Subject: [PATCH 11/19] refactor(core): improve type hints and remove unused file - Add RelevantContentFilter to __init__.py exports - Update version to 0.4.3b3 - Enhance type hints in async_configs.py - Remove empty utils.scraping.py file - Update mkdocs configuration with version info and GitHub integration BREAKING CHANGE: None --- crawl4ai/__init__.py | 3 ++- crawl4ai/__version__.py | 2 +- crawl4ai/async_configs.py | 11 +++++++---- crawl4ai/utils.scraping.py | 0 mkdocs.yml | 11 ++++++++++- 5 files changed, 20 insertions(+), 7 deletions(-) delete mode 100644 crawl4ai/utils.scraping.py diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 482afdd7..7f284323 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -16,7 +16,7 @@ from .extraction_strategy import ( ) from .chunking_strategy import ChunkingStrategy, RegexChunking from .markdown_generation_strategy import DefaultMarkdownGenerator -from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter +from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter, RelevantContentFilter from .models import CrawlResult, MarkdownGenerationResult from .async_dispatcher import ( MemoryAdaptiveDispatcher, @@ -44,6 +44,7 @@ __all__ = [ "ChunkingStrategy", "RegexChunking", "DefaultMarkdownGenerator", + "RelevantContentFilter", "PruningContentFilter", "BM25ContentFilter", "LLMContentFilter", diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index a0acc761..3274435a 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.4.3b2" +__version__ = "0.4.3b3" diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index b0813abe..c1404026 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -6,12 +6,15 @@ from 
.config import ( IMAGE_SCORE_THRESHOLD, SOCIAL_MEDIA_DOMAINS, ) + from .user_agent_generator import UserAgentGenerator from .extraction_strategy import ExtractionStrategy from .chunking_strategy import ChunkingStrategy, RegexChunking from .markdown_generation_strategy import MarkdownGenerationStrategy +from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter, LLMContentFilter, PruningContentFilter from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy from typing import Optional, Union, List +from .cache_context import CacheMode class BrowserConfig: @@ -81,13 +84,13 @@ class BrowserConfig: user_data_dir: str = None, chrome_channel: str = "chromium", channel: str = "chromium", - proxy: Optional[str] = None, + proxy: str = None, proxy_config: dict = None, viewport_width: int = 1080, viewport_height: int = 600, accept_downloads: bool = False, downloads_path: str = None, - storage_state=None, + storage_state : Union[str, dict, None]=None, ignore_https_errors: bool = True, java_script_enabled: bool = True, sleep_on_close: bool = False, @@ -382,7 +385,7 @@ class CrawlerRunConfig: extraction_strategy: ExtractionStrategy = None, chunking_strategy: ChunkingStrategy = RegexChunking(), markdown_generator: MarkdownGenerationStrategy = None, - content_filter=None, + content_filter : RelevantContentFilter = None, only_text: bool = False, css_selector: str = None, excluded_tags: list = None, @@ -396,7 +399,7 @@ class CrawlerRunConfig: # SSL Parameters fetch_ssl_certificate: bool = False, # Caching Parameters - cache_mode=None, + cache_mode: CacheMode =None, session_id: str = None, bypass_cache: bool = False, disable_cache: bool = False, diff --git a/crawl4ai/utils.scraping.py b/crawl4ai/utils.scraping.py deleted file mode 100644 index e69de29b..00000000 diff --git a/mkdocs.yml b/mkdocs.yml index 255492e3..16f44b05 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,4 +1,4 @@ -site_name: Crawl4AI Documentation +site_name: Crawl4AI 
Documentation (v0.4.3b2) site_description: 🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper site_url: https://docs.crawl4ai.com repo_url: https://github.com/unclecode/crawl4ai @@ -52,6 +52,11 @@ nav: theme: name: 'terminal' palette: 'dark' + icon: + repo: fontawesome/brands/github + +plugins: + - search markdown_extensions: - pymdownx.highlight: @@ -64,6 +69,9 @@ markdown_extensions: - attr_list - tables +extra: + version: !ENV [CRAWL4AI_VERSION, 'development'] + extra_css: - assets/styles.css - assets/highlight.css @@ -72,3 +80,4 @@ extra_css: extra_javascript: - assets/highlight.min.js - assets/highlight_init.js + - https://buttons.github.io/buttons.js \ No newline at end of file From 6a01008a2b518748e662d28cdd6ad4a0f8ab70c0 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 23 Jan 2025 22:33:36 +0800 Subject: [PATCH 12/19] docs(multi-url): improve documentation clarity and update examples - Restructure multi-URL crawling documentation with better formatting and examples - Update code examples to use new API syntax (arun_many) - Add detailed parameter explanations for RateLimiter and Dispatchers - Enhance CSS styling for better documentation readability - Fix outdated method calls in feature demo script BREAKING CHANGE: Updated dispatcher.run_urls() to crawler.arun_many() in examples --- docs/examples/v0_4_3b2_features_demo.py | 20 ++- docs/md_v2/advanced/multi-url-crawling.md | 195 +++++++++++++++++++--- docs/md_v2/assets/styles.css | 14 ++ 3 files changed, 201 insertions(+), 28 deletions(-) diff --git a/docs/examples/v0_4_3b2_features_demo.py b/docs/examples/v0_4_3b2_features_demo.py index 6e091423..a3a7355b 100644 --- a/docs/examples/v0_4_3b2_features_demo.py +++ b/docs/examples/v0_4_3b2_features_demo.py @@ -85,10 +85,10 @@ async def demo_memory_dispatcher(): ) print("\n🚀 Starting batch crawl...") - results = await dispatcher.run_urls( + results = await crawler.arun_many( urls=urls, - crawler=crawler, config=crawler_config, + dispatcher=dispatcher 
) print(f"\n✅ Completed {len(results)} URLs successfully") @@ -115,15 +115,17 @@ async def demo_streaming_support(): dispatcher = MemoryAdaptiveDispatcher(max_session_permit=3, check_interval=0.5) print("Starting streaming crawl...") - async for result in dispatcher.run_urls_stream( - urls=urls, crawler=crawler, config=crawler_config + async for result in await crawler.arun_many( + urls=urls, + config=crawler_config, + dispatcher=dispatcher ): # Process each result as it arrives print( - f"Received result for {result.url} - Success: {result.result.success}" + f"Received result for {result.url} - Success: {result.success}" ) - if result.result.success: - print(f"Content length: {len(result.result.markdown)}") + if result.success: + print(f"Content length: {len(result.markdown)}") async def demo_content_scraping(): @@ -147,6 +149,8 @@ async def demo_content_scraping(): print("Successfully scraped content using LXML strategy") + + async def demo_llm_markdown(): """ 4. LLM-Powered Markdown Generation Demo @@ -336,7 +340,7 @@ async def main(): # Efficiency & Speed Demos print("\n🚀 EFFICIENCY & SPEED DEMOS") - await demo_memory_dispatcher() + # await demo_memory_dispatcher() await demo_streaming_support() await demo_content_scraping() diff --git a/docs/md_v2/advanced/multi-url-crawling.md b/docs/md_v2/advanced/multi-url-crawling.md index 12c4f916..f6d944d6 100644 --- a/docs/md_v2/advanced/multi-url-crawling.md +++ b/docs/md_v2/advanced/multi-url-crawling.md @@ -5,16 +5,20 @@ ## 1. Introduction When crawling many URLs: + - **Basic**: Use `arun()` in a loop (simple but less efficient) - **Better**: Use `arun_many()`, which efficiently handles multiple URLs with proper concurrency control - **Best**: Customize dispatcher behavior for your specific needs (memory management, rate limits, etc.) 
**Why Dispatchers?** + - **Adaptive**: Memory-based dispatchers can pause or slow down based on system resources - **Rate-limiting**: Built-in rate limiting with exponential backoff for 429/503 responses - **Real-time Monitoring**: Live dashboard of ongoing tasks, memory usage, and performance - **Flexibility**: Choose between memory-adaptive or semaphore-based concurrency +--- + ## 2. Core Components ### 2.1 Rate Limiter @@ -22,34 +26,116 @@ When crawling many URLs: ```python class RateLimiter: def __init__( - base_delay: Tuple[float, float] = (1.0, 3.0), # Random delay range between requests - max_delay: float = 60.0, # Maximum backoff delay - max_retries: int = 3, # Retries before giving up - rate_limit_codes: List[int] = [429, 503] # Status codes triggering backoff + # Random delay range between requests + base_delay: Tuple[float, float] = (1.0, 3.0), + + # Maximum backoff delay + max_delay: float = 60.0, + + # Retries before giving up + max_retries: int = 3, + + # Status codes triggering backoff + rate_limit_codes: List[int] = [429, 503] ) ``` -The RateLimiter provides: -- Random delays between requests -- Exponential backoff on rate limit responses -- Domain-specific rate limiting -- Automatic retry handling +The sections below describe each **RateLimiter** constructor parameter in detail. + +#### RateLimiter Constructor Parameters + +The **RateLimiter** is a utility that helps manage the pace of requests to avoid overloading servers or getting blocked due to rate limits. It operates internally to delay requests and handle retries but can be configured using its constructor parameters. + +**Parameters of the `RateLimiter` constructor:** + +1. **`base_delay`** (`Tuple[float, float]`, default: `(1.0, 3.0)`) +  The range for a random delay (in seconds) between consecutive requests to the same domain. 
+ +- A random delay is chosen between `base_delay[0]` and `base_delay[1]` for each request. +- This prevents sending requests at a predictable frequency, reducing the chances of triggering rate limits. + +**Example:** +If `base_delay = (2.0, 5.0)`, delays could be randomly chosen as `2.3s`, `4.1s`, etc. + +--- + +2. **`max_delay`** (`float`, default: `60.0`) +  The maximum allowable delay when rate-limiting errors occur. + +- When servers return rate-limit responses (e.g., 429 or 503), the delay increases exponentially with jitter. +- The `max_delay` ensures the delay doesn’t grow unreasonably high, capping it at this value. + +**Example:** +For a `max_delay = 30.0`, even if backoff calculations suggest a delay of `45s`, it will cap at `30s`. + +--- + +3. **`max_retries`** (`int`, default: `3`) +  The maximum number of retries for a request if rate-limiting errors occur. + +- After encountering a rate-limit response, the `RateLimiter` retries the request up to this number of times. +- If all retries fail, the request is marked as failed, and the process continues. + +**Example:** +If `max_retries = 3`, the system retries a failed request three times before giving up. + +--- + +4. **`rate_limit_codes`** (`List[int]`, default: `[429, 503]`) +  A list of HTTP status codes that trigger the rate-limiting logic. + +- These status codes indicate the server is overwhelmed or actively limiting requests. +- You can customize this list to include other codes based on specific server behavior. + +**Example:** +If `rate_limit_codes = [429, 503, 504]`, the crawler will back off on these three error codes. 
+ +--- + +**How to Use the `RateLimiter`:** + +Here’s an example of initializing and using a `RateLimiter` in your project: + +```python +from crawl4ai import RateLimiter + +# Create a RateLimiter with custom settings +rate_limiter = RateLimiter( + base_delay=(2.0, 4.0), # Random delay between 2-4 seconds + max_delay=30.0, # Cap delay at 30 seconds + max_retries=5, # Retry up to 5 times on rate-limiting errors + rate_limit_codes=[429, 503] # Handle these HTTP status codes +) + +# RateLimiter will handle delays and retries internally +# No additional setup is required for its operation +``` + +The `RateLimiter` integrates seamlessly with dispatchers like `MemoryAdaptiveDispatcher` and `SemaphoreDispatcher`, ensuring requests are paced correctly without user intervention. Its internal mechanisms manage delays and retries to avoid overwhelming servers while maximizing efficiency. + ### 2.2 Crawler Monitor The CrawlerMonitor provides real-time visibility into crawling operations: ```python +from crawl4ai import CrawlerMonitor, DisplayMode monitor = CrawlerMonitor( - max_visible_rows=15, # Maximum rows in live display - display_mode=DisplayMode.DETAILED # DETAILED or AGGREGATED view + # Maximum rows in live display + max_visible_rows=15, + + # DETAILED or AGGREGATED view + display_mode=DisplayMode.DETAILED ) ``` **Display Modes**: + 1. **DETAILED**: Shows individual task status, memory usage, and timing 2. **AGGREGATED**: Displays summary statistics and overall progress +--- + ## 3. 
Available Dispatchers ### 3.1 MemoryAdaptiveDispatcher (Default) @@ -57,6 +143,8 @@ monitor = CrawlerMonitor( Automatically manages concurrency based on system memory usage: ```python +from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher + dispatcher = MemoryAdaptiveDispatcher( memory_threshold_percent=90.0, # Pause if memory exceeds this check_interval=1.0, # How often to check memory @@ -73,13 +161,37 @@ dispatcher = MemoryAdaptiveDispatcher( ) ``` +**Constructor Parameters:** + +1. **`memory_threshold_percent`** (`float`, default: `90.0`) +  Specifies the memory usage threshold (as a percentage). If system memory usage exceeds this value, the dispatcher pauses crawling to prevent system overload. + +2. **`check_interval`** (`float`, default: `1.0`) +  The interval (in seconds) at which the dispatcher checks system memory usage. + +3. **`max_session_permit`** (`int`, default: `10`) +  The maximum number of concurrent crawling tasks allowed. This ensures resource limits are respected while maintaining concurrency. + +4. **`memory_wait_timeout`** (`float`, default: `300.0`) +  Optional timeout (in seconds). If memory usage exceeds `memory_threshold_percent` for longer than this duration, a `MemoryError` is raised. + +5. **`rate_limiter`** (`RateLimiter`, default: `None`) +  Optional rate-limiting logic to avoid server-side blocking (e.g., for handling 429 or 503 errors). See **RateLimiter** for details. + +6. **`monitor`** (`CrawlerMonitor`, default: `None`) +  Optional monitoring for real-time task tracking and performance insights. See **CrawlerMonitor** for details. 
+ +--- + ### 3.2 SemaphoreDispatcher Provides simple concurrency control with a fixed limit: ```python +from crawl4ai.async_dispatcher import SemaphoreDispatcher + dispatcher = SemaphoreDispatcher( - max_session_permit=5, # Fixed concurrent tasks + max_session_permit=20, # Maximum concurrent tasks rate_limiter=RateLimiter( # Optional rate limiting base_delay=(0.5, 1.0), max_delay=10.0 @@ -91,6 +203,19 @@ dispatcher = SemaphoreDispatcher( ) ``` +**Constructor Parameters:** + +1. **`max_session_permit`** (`int`, default: `20`) +  The maximum number of concurrent crawling tasks allowed, irrespective of semaphore slots. + +2. **`rate_limiter`** (`RateLimiter`, default: `None`) +  Optional rate-limiting logic to avoid overwhelming servers. See **RateLimiter** for details. + +3. **`monitor`** (`CrawlerMonitor`, default: `None`) +  Optional monitoring for tracking task progress and resource usage. See **CrawlerMonitor** for details. + +--- + ## 4. Usage Examples ### 4.1 Batch Processing (Default) @@ -128,6 +253,14 @@ async def crawl_batch(): print(f"Failed to crawl {result.url}: {result.error_message}") ``` +**Review:** +- **Purpose:** Executes a batch crawl with all URLs processed together after crawling is complete. +- **Dispatcher:** Uses `MemoryAdaptiveDispatcher` to manage concurrency and system memory. +- **Stream:** Disabled (`stream=False`), so all results are collected at once for post-processing. +- **Best Use Case:** When you need to analyze results in bulk rather than individually during the crawl. + +--- + ### 4.2 Streaming Mode ```python @@ -161,6 +294,14 @@ async def crawl_streaming(): print(f"Failed to crawl {result.url}: {result.error_message}") ``` +**Review:** +- **Purpose:** Enables streaming to process results as soon as they’re available. +- **Dispatcher:** Uses `MemoryAdaptiveDispatcher` for concurrency and memory management. +- **Stream:** Enabled (`stream=True`), allowing real-time processing during crawling. 
+- **Best Use Case:** When you need to act on results immediately, such as for real-time analytics or progressive data storage. + +--- + ### 4.3 Semaphore-based Crawling ```python @@ -189,6 +330,14 @@ async def crawl_with_semaphore(urls): return results ``` +**Review:** +- **Purpose:** Uses `SemaphoreDispatcher` to limit concurrency with a fixed number of slots. +- **Dispatcher:** Configured with a semaphore to control parallel crawling tasks. +- **Rate Limiter:** Prevents servers from being overwhelmed by pacing requests. +- **Best Use Case:** When you want precise control over the number of concurrent requests, independent of system memory. + +--- + ### 4.4 Robots.txt Consideration ```python @@ -221,11 +370,13 @@ if __name__ == "__main__": asyncio.run(main()) ``` -**Key Points**: -- When `check_robots_txt=True`, each URL's robots.txt is checked before crawling -- Robots.txt files are cached for efficiency -- Failed robots.txt checks return 403 status code -- Dispatcher handles robots.txt checks automatically for each URL +**Review:** +- **Purpose:** Ensures compliance with `robots.txt` rules for ethical and legal web crawling. +- **Configuration:** Set `check_robots_txt=True` to validate each URL against `robots.txt` before crawling. +- **Dispatcher:** Handles requests with concurrency limits (`semaphore_count=3`). +- **Best Use Case:** When crawling websites that strictly enforce robots.txt policies or for responsible crawling practices. + +--- ## 5. Dispatch Results @@ -255,20 +406,24 @@ for result in results: ## 6. Summary -1. **Two Dispatcher Types**: +1. **Two Dispatcher Types**: + - MemoryAdaptiveDispatcher (default): Dynamic concurrency based on memory - SemaphoreDispatcher: Fixed concurrency limit -2. **Optional Components**: +2. **Optional Components**: + - RateLimiter: Smart request pacing and backoff - CrawlerMonitor: Real-time progress visualization -3. **Key Benefits**: +3. 
**Key Benefits**: + - Automatic memory management - Built-in rate limiting - Live progress monitoring - Flexible concurrency control Choose the dispatcher that best fits your needs: + - **MemoryAdaptiveDispatcher**: For large crawls or limited resources - **SemaphoreDispatcher**: For simple, fixed-concurrency scenarios diff --git a/docs/md_v2/assets/styles.css b/docs/md_v2/assets/styles.css index ed7fc12e..8ee8cbb1 100644 --- a/docs/md_v2/assets/styles.css +++ b/docs/md_v2/assets/styles.css @@ -95,6 +95,10 @@ strong { } +div.highlight { + margin-bottom: 2em; +} + .terminal-card > header { color: var(--font-color); text-align: center; @@ -231,6 +235,16 @@ pre { font-size: 2em; } +.terminal h2 { + font-size: 1.5em; + margin-bottom: 0.8em; +} + +.terminal h3 { + font-size: 1.3em; + margin-bottom: 0.8em; +} + .terminal h1, .terminal h2, .terminal h3, .terminal h4, .terminal h5, .terminal h6 { text-shadow: 0 0 0px var(--font-color), 0 0 0px var(--font-color), 0 0 0px var(--font-color); } From 65d33bcc0f50c8b7e2ee4c875d70fe3a3c866a94 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 23 Jan 2025 22:36:58 +0800 Subject: [PATCH 13/19] style(docs): improve code formatting in features demo Clean up whitespace and improve readability in v0_4_3b2_features_demo.py: - Remove excessive blank lines between functions - Improve config formatting for better readability - Uncomment memory dispatcher demo in main function No breaking changes. --- docs/examples/v0_4_3b2_features_demo.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/docs/examples/v0_4_3b2_features_demo.py b/docs/examples/v0_4_3b2_features_demo.py index a3a7355b..7771c3f8 100644 --- a/docs/examples/v0_4_3b2_features_demo.py +++ b/docs/examples/v0_4_3b2_features_demo.py @@ -95,7 +95,6 @@ async def demo_memory_dispatcher(): except Exception as e: print(f"\n❌ Error in memory dispatcher demo: {str(e)}") - async def demo_streaming_support(): """ 2. 
Streaming Support Demo @@ -127,7 +126,6 @@ async def demo_streaming_support(): if result.success: print(f"Content length: {len(result.markdown)}") - async def demo_content_scraping(): """ 3. Content Scraping Strategy Demo @@ -140,7 +138,10 @@ async def demo_content_scraping(): url = "https://example.com/article" # Configure with the new LXML strategy - config = CrawlerRunConfig(scraping_strategy=LXMLWebScrapingStrategy(), verbose=True) + config = CrawlerRunConfig( + scraping_strategy=LXMLWebScrapingStrategy(), + verbose=True + ) print("Scraping content with LXML strategy...") async with crawler: @@ -148,9 +149,6 @@ async def demo_content_scraping(): if result.success: print("Successfully scraped content using LXML strategy") - - - async def demo_llm_markdown(): """ 4. LLM-Powered Markdown Generation Demo @@ -201,7 +199,6 @@ async def demo_llm_markdown(): print(result.markdown_v2.fit_markdown[:500]) print("Successfully generated LLM-filtered markdown") - async def demo_robots_compliance(): """ 5. Robots.txt Compliance Demo @@ -225,8 +222,6 @@ async def demo_robots_compliance(): elif result.success: print(f"Successfully crawled: {result.url}") - - async def demo_json_schema_generation(): """ 7. LLM-Powered Schema Generation Demo @@ -280,7 +275,6 @@ async def demo_json_schema_generation(): print(json.dumps(result.extracted_content, indent=2) if result.extracted_content else None) print("Successfully used generated schema for crawling") - async def demo_proxy_rotation(): """ 8. 
Proxy Rotation Demo @@ -340,7 +334,7 @@ async def main(): # Efficiency & Speed Demos print("\n🚀 EFFICIENCY & SPEED DEMOS") - # await demo_memory_dispatcher() + await demo_memory_dispatcher() await demo_streaming_support() await demo_content_scraping() From 0afc3e9e5e38b09d0995042ecaa9c77de66842e1 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 23 Jan 2025 22:37:29 +0800 Subject: [PATCH 14/19] refactor(examples): update API usage in features demo Update the demo script to use the new crawler.arun_many() API instead of dispatcher.run_urls() and fix result access patterns. Also improve code formatting and remove extra whitespace. - Replace dispatcher.run_urls with crawler.arun_many - Update streaming demo to use new API and correct result access - Clean up whitespace and formatting - Simplify result property access patterns --- docs/examples/v0_4_3b2_features_demo.py | 28 ++++++++++++------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/docs/examples/v0_4_3b2_features_demo.py b/docs/examples/v0_4_3b2_features_demo.py index 6e091423..7771c3f8 100644 --- a/docs/examples/v0_4_3b2_features_demo.py +++ b/docs/examples/v0_4_3b2_features_demo.py @@ -85,17 +85,16 @@ async def demo_memory_dispatcher(): ) print("\n🚀 Starting batch crawl...") - results = await dispatcher.run_urls( + results = await crawler.arun_many( urls=urls, - crawler=crawler, config=crawler_config, + dispatcher=dispatcher ) print(f"\n✅ Completed {len(results)} URLs successfully") except Exception as e: print(f"\n❌ Error in memory dispatcher demo: {str(e)}") - async def demo_streaming_support(): """ 2. 
Streaming Support Demo @@ -115,16 +114,17 @@ async def demo_streaming_support(): dispatcher = MemoryAdaptiveDispatcher(max_session_permit=3, check_interval=0.5) print("Starting streaming crawl...") - async for result in dispatcher.run_urls_stream( - urls=urls, crawler=crawler, config=crawler_config + async for result in await crawler.arun_many( + urls=urls, + config=crawler_config, + dispatcher=dispatcher ): # Process each result as it arrives print( - f"Received result for {result.url} - Success: {result.result.success}" + f"Received result for {result.url} - Success: {result.success}" ) - if result.result.success: - print(f"Content length: {len(result.result.markdown)}") - + if result.success: + print(f"Content length: {len(result.markdown)}") async def demo_content_scraping(): """ @@ -138,7 +138,10 @@ async def demo_content_scraping(): url = "https://example.com/article" # Configure with the new LXML strategy - config = CrawlerRunConfig(scraping_strategy=LXMLWebScrapingStrategy(), verbose=True) + config = CrawlerRunConfig( + scraping_strategy=LXMLWebScrapingStrategy(), + verbose=True + ) print("Scraping content with LXML strategy...") async with crawler: @@ -146,7 +149,6 @@ async def demo_content_scraping(): if result.success: print("Successfully scraped content using LXML strategy") - async def demo_llm_markdown(): """ 4. LLM-Powered Markdown Generation Demo @@ -197,7 +199,6 @@ async def demo_llm_markdown(): print(result.markdown_v2.fit_markdown[:500]) print("Successfully generated LLM-filtered markdown") - async def demo_robots_compliance(): """ 5. Robots.txt Compliance Demo @@ -221,8 +222,6 @@ async def demo_robots_compliance(): elif result.success: print(f"Successfully crawled: {result.url}") - - async def demo_json_schema_generation(): """ 7. 
LLM-Powered Schema Generation Demo @@ -276,7 +275,6 @@ async def demo_json_schema_generation(): print(json.dumps(result.extracted_content, indent=2) if result.extracted_content else None) print("Successfully used generated schema for crawling") - async def demo_proxy_rotation(): """ 8. Proxy Rotation Demo From 69a77222efe976b4fb9c3b2074817e858a6d7248 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 24 Jan 2025 15:53:47 +0800 Subject: [PATCH 15/19] feat(browser): add CDP URL configuration support Add support for direct CDP URL configuration in BrowserConfig and ManagedBrowser classes. This allows connecting to remote browser instances using custom CDP endpoints instead of always launching a local browser. - Added cdp_url parameter to BrowserConfig - Added cdp_url support in ManagedBrowser.start() method - Updated documentation for new parameters --- crawl4ai/async_configs.py | 6 ++++++ crawl4ai/async_crawler_strategy.py | 13 +++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index c1404026..d0a9b9e1 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -32,6 +32,7 @@ class BrowserConfig: Default: True. use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing advanced manipulation. Default: False. + cdp_url (str): URL for the Chrome DevTools Protocol (CDP) endpoint. Default: "ws://localhost:9222/devtools/browser/". debugging_port (int): Port for the browser debugging protocol. Default: 9222. use_persistent_context (bool): Use a persistent browser context (like a persistent profile). Automatically sets use_managed_browser=True. Default: False. 
@@ -80,6 +81,7 @@ class BrowserConfig: browser_type: str = "chromium", headless: bool = True, use_managed_browser: bool = False, + cdp_url: str = None, use_persistent_context: bool = False, user_data_dir: str = None, chrome_channel: str = "chromium", @@ -107,10 +109,12 @@ class BrowserConfig: light_mode: bool = False, extra_args: list = None, debugging_port: int = 9222, + host: str = "localhost", ): self.browser_type = browser_type self.headless = headless self.use_managed_browser = use_managed_browser + self.cdp_url = cdp_url self.use_persistent_context = use_persistent_context self.user_data_dir = user_data_dir self.chrome_channel = chrome_channel or self.browser_type or "chromium" @@ -162,6 +166,7 @@ class BrowserConfig: browser_type=kwargs.get("browser_type", "chromium"), headless=kwargs.get("headless", True), use_managed_browser=kwargs.get("use_managed_browser", False), + cdp_url=kwargs.get("cdp_url"), use_persistent_context=kwargs.get("use_persistent_context", False), user_data_dir=kwargs.get("user_data_dir"), chrome_channel=kwargs.get("chrome_channel", "chromium"), @@ -194,6 +199,7 @@ class BrowserConfig: "browser_type": self.browser_type, "headless": self.headless, "use_managed_browser": self.use_managed_browser, + "cdp_url": self.cdp_url, "use_persistent_context": self.use_persistent_context, "user_data_dir": self.user_data_dir, "chrome_channel": self.chrome_channel, diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 738dfb51..b11796e0 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -102,6 +102,7 @@ class ManagedBrowser: logger=None, host: str = "localhost", debugging_port: int = 9222, + cdp_url: Optional[str] = None, ): """ Initialize the ManagedBrowser instance. @@ -116,6 +117,7 @@ class ManagedBrowser: logger (logging.Logger): Logger instance for logging messages. Default: None. host (str): Host for debugging the browser. Default: "localhost". 
debugging_port (int): Port for debugging the browser. Default: 9222. + cdp_url (str or None): CDP URL to connect to the browser. Default: None. """ self.browser_type = browser_type self.user_data_dir = user_data_dir @@ -129,9 +131,16 @@ class ManagedBrowser: async def start(self) -> str: """ - Starts the browser process and returns the CDP endpoint URL. - If user_data_dir is not provided, creates a temporary directory. + Starts the browser process or returns CDP endpoint URL. + If cdp_url is provided, returns it directly. + If user_data_dir is not provided for local browser, creates a temporary directory. + + Returns: + str: CDP endpoint URL """ + # If CDP URL provided, just return it + if self.cdp_url: + return self.cdp_url # Create temp dir if needed if not self.user_data_dir: From 4d7f91b3789d645b1a3231552ac46a2c136ee607 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 25 Jan 2025 21:16:39 +0800 Subject: [PATCH 16/19] refactor(user-agent): improve user agent generation system Redesign user agent generation to be more modular and reliable: - Add abstract base class UAGen for user agent generation - Implement ValidUAGenerator using fake-useragent library - Add OnlineUAGenerator for fetching real-world user agents - Update browser configurations to use new UA generation system - Improve client hints generation This change makes the user agent system more maintainable and provides better real-world user agent coverage. 
--- crawl4ai/async_configs.py | 43 +++++-- crawl4ai/async_crawler_strategy.py | 14 ++- crawl4ai/user_agent_generator.py | 182 ++++++++++++++++++++++++----- 3 files changed, 196 insertions(+), 43 deletions(-) diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index d0a9b9e1..44c83262 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -7,7 +7,7 @@ from .config import ( SOCIAL_MEDIA_DOMAINS, ) -from .user_agent_generator import UserAgentGenerator +from .user_agent_generator import UserAgentGenerator, UAGen, ValidUAGenerator, OnlineUAGenerator from .extraction_strategy import ExtractionStrategy from .chunking_strategy import ChunkingStrategy, RegexChunking from .markdown_generation_strategy import MarkdownGenerationStrategy @@ -100,11 +100,13 @@ class BrowserConfig: cookies: list = None, headers: dict = None, user_agent: str = ( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" + # "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 " + # "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + # "(KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36" ), - user_agent_mode: str = None, - user_agent_generator_config: dict = None, + user_agent_mode: str = "", + user_agent_generator_config: dict = {}, text_mode: bool = False, light_mode: bool = False, extra_args: list = None, @@ -143,17 +145,15 @@ class BrowserConfig: self.verbose = verbose self.debugging_port = debugging_port - user_agenr_generator = UserAgentGenerator() - if self.user_agent_mode != "random" and self.user_agent_generator_config: - self.user_agent = user_agenr_generator.generate( + fa_user_agenr_generator = ValidUAGenerator() + if self.user_agent_mode == "random": + self.user_agent = fa_user_agenr_generator.generate( 
**(self.user_agent_generator_config or {}) ) - elif self.user_agent_mode == "random": - self.user_agent = user_agenr_generator.generate() else: pass - - self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent) + + self.browser_hint = UAGen.generate_client_hints(self.user_agent) self.headers.setdefault("sec-ch-ua", self.browser_hint) # If persistent context is requested, ensure managed browser is enabled @@ -382,6 +382,11 @@ class CrawlerRunConfig: stream (bool): If True, stream the page content as it is being loaded. url: str = None # This is not a compulsory parameter check_robots_txt (bool): Whether to check robots.txt rules before crawling. Default: False + user_agent (str): Custom User-Agent string to use. Default: None + user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided + user_agent as-is. Default: None. + user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set. + Default: None. 
""" def __init__( @@ -453,6 +458,9 @@ class CrawlerRunConfig: stream: bool = False, url: str = None, check_robots_txt: bool = False, + user_agent: str = None, + user_agent_mode: str = None, + user_agent_generator_config: dict = {}, ): self.url = url @@ -535,6 +543,11 @@ class CrawlerRunConfig: # Robots.txt Handling Parameters self.check_robots_txt = check_robots_txt + # User Agent Parameters + self.user_agent = user_agent + self.user_agent_mode = user_agent_mode + self.user_agent_generator_config = user_agent_generator_config + # Validate type of extraction strategy and chunking strategy if they are provided if self.extraction_strategy is not None and not isinstance( self.extraction_strategy, ExtractionStrategy @@ -632,6 +645,9 @@ class CrawlerRunConfig: stream=kwargs.get("stream", False), url=kwargs.get("url"), check_robots_txt=kwargs.get("check_robots_txt", False), + user_agent=kwargs.get("user_agent"), + user_agent_mode=kwargs.get("user_agent_mode"), + user_agent_generator_config=kwargs.get("user_agent_generator_config", {}), ) # Create a funciton returns dict of the object @@ -695,6 +711,9 @@ class CrawlerRunConfig: "stream": self.stream, "url": self.url, "check_robots_txt": self.check_robots_txt, + "user_agent": self.user_agent, + "user_agent_mode": self.user_agent_mode, + "user_agent_generator_config": self.user_agent_generator_config, } def clone(self, **kwargs): diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index b11796e0..62ee4c65 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -23,6 +23,7 @@ from .async_logger import AsyncLogger from playwright_stealth import StealthConfig from .ssl_certificate import SSLCertificate from .utils import get_home_folder, get_chromium_path +from .user_agent_generator import ValidUAGenerator, OnlineUAGenerator stealth_config = StealthConfig( webdriver=True, @@ -128,6 +129,7 @@ class ManagedBrowser: self.host = host self.logger = logger 
self.shutting_down = False + self.cdp_url = cdp_url async def start(self) -> str: """ @@ -563,7 +565,7 @@ class BrowserManager: Context: Browser context object with the specified configurations """ # Base settings - user_agent = self.config.headers.get("User-Agent", self.config.user_agent) + user_agent = self.config.headers.get("User-Agent", self.config.user_agent) viewport_settings = { "width": self.config.viewport_width, "height": self.config.viewport_height, @@ -1269,10 +1271,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self._downloaded_files = [] # Handle user agent with magic mode - user_agent = self.browser_config.user_agent - if config.magic and self.browser_config.user_agent_mode != "random": - self.browser_config.user_agent = UserAgentGenerator().generate( - **(self.browser_config.user_agent_generator_config or {}) + user_agent_to_override = config.user_agent + if user_agent_to_override: + self.browser_config.user_agent = user_agent_to_override + elif config.magic or config.user_agent_mode == "random": + self.browser_config.user_agent = ValidUAGenerator().generate( + **(config.user_agent_generator_config or {}) ) # Get page for session diff --git a/crawl4ai/user_agent_generator.py b/crawl4ai/user_agent_generator.py index 4f0f42cb..91e7a31d 100644 --- a/crawl4ai/user_agent_generator.py +++ b/crawl4ai/user_agent_generator.py @@ -2,8 +2,146 @@ import random from typing import Optional, Literal, List, Dict, Tuple import re +from abc import ABC, abstractmethod +import random +from fake_useragent import UserAgent +import requests +from lxml import html +import json +from typing import Optional, List, Union, Dict -class UserAgentGenerator: +class UAGen(ABC): + @abstractmethod + def generate(self, + browsers: Optional[List[str]] = None, + os: Optional[Union[str, List[str]]] = None, + min_version: float = 0.0, + platforms: Optional[Union[str, List[str]]] = None, + pct_threshold: Optional[float] = None, + fallback: str = "Mozilla/5.0 (X11; Linux 
x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36") -> Union[str, Dict]: + pass + + @staticmethod + def generate_client_hints( user_agent: str) -> str: + """Generate Sec-CH-UA header value based on user agent string""" + def _parse_user_agent(user_agent: str) -> Dict[str, str]: + """Parse a user agent string to extract browser and version information""" + browsers = { + "chrome": r"Chrome/(\d+)", + "edge": r"Edg/(\d+)", + "safari": r"Version/(\d+)", + "firefox": r"Firefox/(\d+)", + } + + result = {} + for browser, pattern in browsers.items(): + match = re.search(pattern, user_agent) + if match: + result[browser] = match.group(1) + + return result + browsers = _parse_user_agent(user_agent) + + # Client hints components + hints = [] + + # Handle different browser combinations + if "chrome" in browsers: + hints.append(f'"Chromium";v="{browsers["chrome"]}"') + hints.append('"Not_A Brand";v="8"') + + if "edge" in browsers: + hints.append(f'"Microsoft Edge";v="{browsers["edge"]}"') + else: + hints.append(f'"Google Chrome";v="{browsers["chrome"]}"') + + elif "firefox" in browsers: + # Firefox doesn't typically send Sec-CH-UA + return '""' + + elif "safari" in browsers: + # Safari's format for client hints + hints.append(f'"Safari";v="{browsers["safari"]}"') + hints.append('"Not_A Brand";v="8"') + + return ", ".join(hints) + +class ValidUAGenerator(UAGen): + def __init__(self): + self.ua = UserAgent() + + def generate(self, + browsers: Optional[List[str]] = None, + os: Optional[Union[str, List[str]]] = None, + min_version: float = 0.0, + platforms: Optional[Union[str, List[str]]] = None, + pct_threshold: Optional[float] = None, + fallback: str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36") -> str: + + self.ua = UserAgent( + browsers=browsers or ['Chrome', 'Firefox', 'Edge'], + os=os or ['Windows', 'Mac OS X'], + min_version=min_version, + platforms=platforms or ['desktop'], + fallback=fallback + ) + return self.ua.random + 
+class OnlineUAGenerator(UAGen): + def __init__(self): + self.agents = [] + self._fetch_agents() + + def _fetch_agents(self): + try: + response = requests.get( + 'https://www.useragents.me/', + timeout=5, + headers={'Accept': 'text/html,application/xhtml+xml'} + ) + response.raise_for_status() + + tree = html.fromstring(response.content) + json_text = tree.cssselect('#most-common-desktop-useragents-json-csv > div:nth-child(1) > textarea')[0].text + self.agents = json.loads(json_text) + except Exception as e: + print(f"Error fetching agents: {e}") + + def generate(self, + browsers: Optional[List[str]] = None, + os: Optional[Union[str, List[str]]] = None, + min_version: float = 0.0, + platforms: Optional[Union[str, List[str]]] = None, + pct_threshold: Optional[float] = None, + fallback: str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36") -> Dict: + + if not self.agents: + self._fetch_agents() + + filtered_agents = self.agents + + if pct_threshold: + filtered_agents = [a for a in filtered_agents if a['pct'] >= pct_threshold] + + if browsers: + filtered_agents = [a for a in filtered_agents + if any(b.lower() in a['ua'].lower() for b in browsers)] + + if os: + os_list = [os] if isinstance(os, str) else os + filtered_agents = [a for a in filtered_agents + if any(o.lower() in a['ua'].lower() for o in os_list)] + + if platforms: + platform_list = [platforms] if isinstance(platforms, str) else platforms + filtered_agents = [a for a in filtered_agents + if any(p.lower() in a['ua'].lower() for p in platform_list)] + + return filtered_agents[0] if filtered_agents else {'ua': fallback, 'pct': 0} + + + +class UserAgentGenerator(): """ Generate random user agents with specified constraints. 
@@ -187,9 +325,15 @@ class UserAgentGenerator: browser_stack = self.get_browser_stack(num_browsers) # Add appropriate legacy token based on browser stack - if "Firefox" in str(browser_stack): + if "Firefox" in str(browser_stack) or browser_type == "firefox": components.append(random.choice(self.rendering_engines["gecko"])) - elif "Chrome" in str(browser_stack) or "Safari" in str(browser_stack): + elif "Chrome" in str(browser_stack) or "Safari" in str(browser_stack) or browser_type == "chrome": + components.append(self.rendering_engines["chrome_webkit"]) + components.append("(KHTML, like Gecko)") + elif "Edge" in str(browser_stack) or browser_type == "edge": + components.append(self.rendering_engines["safari_webkit"]) + components.append("(KHTML, like Gecko)") + elif "Safari" in str(browser_stack) or browser_type == "safari": components.append(self.rendering_engines["chrome_webkit"]) components.append("(KHTML, like Gecko)") @@ -273,27 +417,13 @@ class UserAgentGenerator: # Example usage: if __name__ == "__main__": - generator = UserAgentGenerator() - print(generator.generate()) + + # Usage example: + generator = ValidUAGenerator() + ua = generator.generate() + print(ua) + + generator = OnlineUAGenerator() + ua = generator.generate() + print(ua) - print("\nSingle browser (Chrome):") - print(generator.generate(num_browsers=1, browser_type="chrome")) - - print("\nTwo browsers (Gecko/Firefox):") - print(generator.generate(num_browsers=2)) - - print("\nThree browsers (Chrome/Safari/Edge):") - print(generator.generate(num_browsers=3)) - - print("\nFirefox on Linux:") - print( - generator.generate( - device_type="desktop", - os_type="linux", - browser_type="firefox", - num_browsers=2, - ) - ) - - print("\nChrome/Safari/Edge on Windows:") - print(generator.generate(device_type="desktop", os_type="windows", num_browsers=3)) From 97796f39d27f3a0f9ee62512f72dafb2e630a29e Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 25 Jan 2025 21:52:35 +0800 Subject: [PATCH 17/19] 
docs(examples): update proxy rotation demo and disable other demos Modify proxy rotation example to include empty user agent setting and comment out other demo functions for focused testing. This change simplifies the demo file to focus specifically on proxy rotation functionality. No breaking changes. --- docs/examples/v0_4_3b2_features_demo.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/docs/examples/v0_4_3b2_features_demo.py b/docs/examples/v0_4_3b2_features_demo.py index 7771c3f8..3b604c62 100644 --- a/docs/examples/v0_4_3b2_features_demo.py +++ b/docs/examples/v0_4_3b2_features_demo.py @@ -297,8 +297,7 @@ async def demo_proxy_rotation(): } except Exception as e: print(f"Error loading proxy: {e}") - return None - + return None # Create 10 test requests to httpbin urls = ["https://httpbin.org/ip"] * 2 @@ -314,7 +313,7 @@ async def demo_proxy_rotation(): continue # Create new config with proxy - current_config = run_config.clone(proxy_config=proxy) + current_config = run_config.clone(proxy_config=proxy, user_agent="") result = await crawler.arun(url=url, config=current_config) if result.success: @@ -334,18 +333,18 @@ async def main(): # Efficiency & Speed Demos print("\n🚀 EFFICIENCY & SPEED DEMOS") - await demo_memory_dispatcher() - await demo_streaming_support() - await demo_content_scraping() + # await demo_memory_dispatcher() + # await demo_streaming_support() + # await demo_content_scraping() # # LLM Integration Demos print("\n🤖 LLM INTEGRATION DEMOS") - await demo_json_schema_generation() - await demo_llm_markdown() + # await demo_json_schema_generation() + # await demo_llm_markdown() # # Core Improvements print("\n🔧 CORE IMPROVEMENT DEMOS") - await demo_robots_compliance() + # await demo_robots_compliance() await demo_proxy_rotation() if __name__ == "__main__": From 09ac7ed008a6ef5b89c78200fa632f7494e55bfc Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 25 Jan 2025 21:56:08 +0800 Subject: [PATCH 18/19] feat(demo): 
uncomment feature demos and add fake-useragent dependency Uncomments demonstration code for memory dispatcher, streaming support, content scraping, JSON schema generation, LLM markdown, and robots compliance in the v0.4.3b2 features demo file. Also adds fake-useragent package as a project dependency. This change makes all feature demonstrations active by default and ensures proper user agent handling capabilities. --- docs/examples/v0_4_3b2_features_demo.py | 12 ++++++------ pyproject.toml | 1 + 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/examples/v0_4_3b2_features_demo.py b/docs/examples/v0_4_3b2_features_demo.py index 3b604c62..1032f346 100644 --- a/docs/examples/v0_4_3b2_features_demo.py +++ b/docs/examples/v0_4_3b2_features_demo.py @@ -333,18 +333,18 @@ async def main(): # Efficiency & Speed Demos print("\n🚀 EFFICIENCY & SPEED DEMOS") - # await demo_memory_dispatcher() - # await demo_streaming_support() - # await demo_content_scraping() + await demo_memory_dispatcher() + await demo_streaming_support() + await demo_content_scraping() # # LLM Integration Demos print("\n🤖 LLM INTEGRATION DEMOS") - # await demo_json_schema_generation() - # await demo_llm_markdown() + await demo_json_schema_generation() + await demo_llm_markdown() # # Core Improvements print("\n🔧 CORE IMPROVEMENT DEMOS") - # await demo_robots_compliance() + await demo_robots_compliance() await demo_proxy_rotation() if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index 328438e9..38e1f89f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ dependencies = [ "rich>=13.9.4", "cssselect>=1.2.0", "httpx==0.27.2", + "fake-useragent>=2.0.3" ] classifiers = [ "Development Status :: 4 - Beta", From dde14eba7db2de240d7a1dc80f436f5c821571e8 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 26 Jan 2025 04:00:28 +0100 Subject: [PATCH 19/19] Update README.md (#562) --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md 
index 8987d19d..a9fcdd19 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,6 @@ I made Crawl4AI open-source for two reasons. First, it’s my way of giving back Thank you to everyone who has supported this project, used it, and shared feedback. Your encouragement motivates me to dream even bigger. Join us, file issues, submit PRs, or spread the word. Together, we can build a tool that truly empowers people to access their own data and reshape the future of AI.
->>>>>>> vr0.4.3b2 ## 🧐 Why Crawl4AI?