Main Article
+Important content paragraph with useful link.
+Key Section
+Detailed explanation with multiple sentences. This should be kept + in the final output. Very important information here.
+diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 78ccdb02..bcb6b3ef 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -23,6 +23,7 @@ from .async_dispatcher import ( RateLimiter, CrawlerMonitor, DisplayMode, + BaseDispatcher ) __all__ = [ @@ -43,6 +44,7 @@ __all__ = [ "DefaultMarkdownGenerator", "PruningContentFilter", "BM25ContentFilter", + "BaseDispatcher", "MemoryAdaptiveDispatcher", "SemaphoreDispatcher", "RateLimiter", diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index 669ddec2..ca5e6ef2 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -14,8 +14,8 @@ from .async_logger import AsyncLogger from .utils import get_error_context, create_box_message # Set up logging -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) +# logging.basicConfig(level=logging.INFO) +# logger = logging.getLogger(__name__) base_directory = DB_PATH = os.path.join( os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai" @@ -333,7 +333,11 @@ class AsyncDatabaseManager: json.loads(row_dict[field]) if row_dict[field] else {} ) except json.JSONDecodeError: - row_dict[field] = {} + # Very UGLY, never mention it to me please + if field == "markdown" and isinstance(row_dict[field], str): + row_dict[field] = row_dict[field] + else: + row_dict[field] = {} if isinstance(row_dict["markdown"], Dict): row_dict["markdown_v2"] = row_dict["markdown"] diff --git a/crawl4ai/models.py b/crawl4ai/models.py index 9ab2389b..217aced4 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -140,21 +140,21 @@ class AsyncCrawlResponse(BaseModel): # Scraping Models ############################### class MediaItem(BaseModel): - src: str - alt: Optional[str] = None - desc: Optional[str] = None - score: int + src: Optional[str] = "" + alt: Optional[str] = "" + desc: Optional[str] = "" + score: Optional[int] = 0 type: str = "image" - group_id: int + group_id: Optional[int] = 0 format: Optional[str] = None width: Optional[int] = None class Link(BaseModel): - href: str - text: str - title: Optional[str] = None - base_domain: str + href: Optional[str] = "" + text: Optional[str] = "" + title: Optional[str] = "" + base_domain: Optional[str] = "" class Media(BaseModel): diff --git a/tests/20241401/test_async_crawler_strategy.py b/tests/20241401/test_async_crawler_strategy.py new file mode 100644 index 00000000..68fe4a88 --- /dev/null +++ b/tests/20241401/test_async_crawler_strategy.py @@ -0,0 +1,343 @@ +import pytest +import pytest_asyncio +import asyncio +from typing import Dict, Any +from pathlib import Path +from unittest.mock import MagicMock, patch +import os +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy +from crawl4ai.models import AsyncCrawlResponse +from crawl4ai.async_logger import AsyncLogger, LogLevel + +CRAWL4AI_HOME_DIR = Path(os.path.expanduser("~")).joinpath(".crawl4ai") + +if not CRAWL4AI_HOME_DIR.joinpath("profiles", "test_profile").exists(): + CRAWL4AI_HOME_DIR.joinpath("profiles", "test_profile").mkdir(parents=True) + +# Test Config Files +@pytest.fixture +def basic_browser_config(): + return BrowserConfig( + browser_type="chromium", + headless=True, + verbose=True + ) + +@pytest.fixture +def advanced_browser_config(): + return BrowserConfig( + browser_type="chromium", + headless=True, + use_managed_browser=True, + user_data_dir=CRAWL4AI_HOME_DIR.joinpath("profiles", "test_profile"), + # proxy="http://localhost:8080", + viewport_width=1920, + viewport_height=1080, + user_agent_mode="random" + ) + +@pytest.fixture +def basic_crawler_config(): + return CrawlerRunConfig( + word_count_threshold=100, + wait_until="domcontentloaded", + page_timeout=30000 + ) + +@pytest.fixture +def logger(): + return AsyncLogger(verbose=True, log_level=LogLevel.DEBUG) + +@pytest_asyncio.fixture +async def crawler_strategy(basic_browser_config, logger): + strategy = AsyncPlaywrightCrawlerStrategy(browser_config=basic_browser_config, logger=logger) + await strategy.start() + yield strategy + await strategy.close() + +# Browser Configuration Tests +@pytest.mark.asyncio +async def test_browser_config_initialization(): + config = BrowserConfig( + browser_type="chromium", + user_agent_mode="random" + ) + assert config.browser_type == "chromium" + assert config.user_agent is not None + assert config.headless is True + +@pytest.mark.asyncio +async def test_persistent_browser_config(): + config = BrowserConfig( + use_persistent_context=True, + user_data_dir="/tmp/test_dir" + ) + assert config.use_managed_browser is True + assert config.user_data_dir == "/tmp/test_dir" + +# Crawler Strategy Tests +@pytest.mark.asyncio +async def test_basic_page_load(crawler_strategy): + response = await crawler_strategy.crawl( + "https://example.com", + CrawlerRunConfig() + ) + assert response.status_code == 200 + assert len(response.html) > 0 + assert "Example Domain" in response.html + +@pytest.mark.asyncio +async def test_screenshot_capture(crawler_strategy): + config = CrawlerRunConfig(screenshot=True) + response = await crawler_strategy.crawl( + "https://example.com", + config + ) + assert response.screenshot is not None + assert len(response.screenshot) > 0 + +@pytest.mark.asyncio +async def test_pdf_generation(crawler_strategy): + config = CrawlerRunConfig(pdf=True) + response = await crawler_strategy.crawl( + "https://example.com", + config + ) + assert response.pdf_data is not None + assert len(response.pdf_data) > 0 + +@pytest.mark.asyncio +async def test_handle_js_execution(crawler_strategy): + config = CrawlerRunConfig( + js_code="document.body.style.backgroundColor = 'red';" + ) + response = await crawler_strategy.crawl( + "https://example.com", + config + ) + assert response.status_code == 200 + assert 'background-color: red' in response.html.lower() + +@pytest.mark.asyncio +async def test_multiple_js_commands(crawler_strategy): + js_commands = [ + "document.body.style.backgroundColor = 'blue';", + "document.title = 'Modified Title';", + "const div = document.createElement('div'); div.id = 'test'; div.textContent = 'Test Content'; document.body.appendChild(div);" + ] + config = CrawlerRunConfig(js_code=js_commands) + response = await crawler_strategy.crawl( + "https://example.com", + config + ) + assert response.status_code == 200 + assert 'background-color: blue' in response.html.lower() + assert 'id="test"' in response.html + assert '>Test Content<' in response.html + assert '
This is a test paragraph with a link.
+More content here with bold text.
+Important content paragraph with useful link.
+Detailed explanation with multiple sentences. This should be kept + in the final output. Very important information here.
++ +
def test(): pass
+ First link to Example 1
+Second link to Test 2
+Image link: 
Repeated link to Example 1 again
+ + """, +} + +def test_content_filters() -> Dict[str, Dict[str, int]]: + """Test various content filtering strategies and return length comparisons.""" + results = {} + + # Initialize filters + pruning_filter = PruningContentFilter( + threshold=0.48, + threshold_type="fixed", + min_word_threshold=2 + ) + + bm25_filter = BM25ContentFilter( + bm25_threshold=1.0, + user_query="test article content important" + ) + + # Test each HTML sample + for test_name, html in TEST_HTML_SAMPLES.items(): + # Store results for this test case + results[test_name] = {} + + # Test PruningContentFilter + start_time = time.time() + pruned_content = pruning_filter.filter_content(html) + pruning_time = time.time() - start_time + + # Test BM25ContentFilter + start_time = time.time() + bm25_content = bm25_filter.filter_content(html) + bm25_time = time.time() - start_time + + # Store results + results[test_name] = { + "original_length": len(html), + "pruned_length": sum(len(c) for c in pruned_content), + "bm25_length": sum(len(c) for c in bm25_content), + "pruning_time": pruning_time, + "bm25_time": bm25_time + } + + return results + +def test_markdown_generation(): + """Test markdown generation with different configurations.""" + results = [] + + # Initialize generators with different configurations + generators = { + "no_filter": DefaultMarkdownGenerator(), + "pruning": DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.48) + ), + "bm25": DefaultMarkdownGenerator( + content_filter=BM25ContentFilter( + user_query="test article content important" + ) + ) + } + + # Test each generator with each HTML sample + for test_name, html in TEST_HTML_SAMPLES.items(): + for gen_name, generator in generators.items(): + start_time = time.time() + result = generator.generate_markdown( + html, + base_url="http://example.com", + citations=True + ) + + results.append({ + "test_case": test_name, + "generator": gen_name, + "time": time.time() - start_time, + "raw_length": len(result.raw_markdown), + "fit_length": len(result.fit_markdown) if result.fit_markdown else 0, + "citations": len(result.references_markdown) + }) + + return results + +def main(): + """Run all tests and print results.""" + print("Starting content filter tests...") + filter_results = test_content_filters() + + print("\nContent Filter Results:") + print("-" * 50) + for test_name, metrics in filter_results.items(): + print(f"\nTest case: {test_name}") + print(f"Original length: {metrics['original_length']}") + print(f"Pruned length: {metrics['pruned_length']} ({metrics['pruning_time']:.3f}s)") + print(f"BM25 length: {metrics['bm25_length']} ({metrics['bm25_time']:.3f}s)") + + print("\nStarting markdown generation tests...") + markdown_results = test_markdown_generation() + + print("\nMarkdown Generation Results:") + print("-" * 50) + for result in markdown_results: + print(f"\nTest: {result['test_case']} - Generator: {result['generator']}") + print(f"Time: {result['time']:.3f}s") + print(f"Raw length: {result['raw_length']}") + print(f"Fit length: {result['fit_length']}") + print(f"Citations: {result['citations']}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tests/20241401/test_async_webcrawler.py b/tests/20241401/test_async_webcrawler.py new file mode 100644 index 00000000..4d7aa815 --- /dev/null +++ b/tests/20241401/test_async_webcrawler.py @@ -0,0 +1,149 @@ +import asyncio +import pytest +from typing import List +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + MemoryAdaptiveDispatcher, + RateLimiter, + CacheMode +) + +@pytest.mark.asyncio +@pytest.mark.parametrize("viewport", [ + (800, 600), + (1024, 768), + (1920, 1080) +]) +async def test_viewport_config(viewport): + """Test different viewport configurations""" + width, height = viewport + browser_config = BrowserConfig( + browser_type="chromium", + headless=True, + viewport_width=width, + viewport_height=height + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://example.com", + config=CrawlerRunConfig( + # cache_mode=CacheMode.BYPASS, + page_timeout=30000 # 30 seconds + ) + ) + assert result.success + +@pytest.mark.asyncio +async def test_memory_management(): + """Test memory-adaptive dispatching""" + browser_config = BrowserConfig( + browser_type="chromium", + headless=True, + viewport_width=1024, + viewport_height=768 + ) + + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=70.0, + check_interval=1.0, + max_session_permit=5 + ) + + urls = ["https://example.com"] * 3 # Test with multiple identical URLs + + async with AsyncWebCrawler(config=browser_config) as crawler: + results = await crawler.arun_many( + urls=urls, + config=CrawlerRunConfig(page_timeout=30000), + dispatcher=dispatcher + ) + assert len(results) == len(urls) + +@pytest.mark.asyncio +async def test_rate_limiting(): + """Test rate limiting functionality""" + browser_config = BrowserConfig( + browser_type="chromium", + headless=True + ) + + dispatcher = MemoryAdaptiveDispatcher( + rate_limiter=RateLimiter( + base_delay=(1.0, 2.0), + max_delay=5.0, + max_retries=2 + ), + memory_threshold_percent=70.0 + ) + + urls = [ + "https://example.com", + "https://example.org", + "https://example.net" + ] + + async with AsyncWebCrawler(config=browser_config) as crawler: + results = await crawler.arun_many( + urls=urls, + config=CrawlerRunConfig(page_timeout=30000), + dispatcher=dispatcher + ) + assert len(results) == len(urls) + +@pytest.mark.asyncio +async def test_javascript_execution(): + """Test JavaScript execution capabilities""" + browser_config = BrowserConfig( + browser_type="chromium", + headless=True, + java_script_enabled=True + ) + + js_code = """ + document.body.style.backgroundColor = 'red'; + return document.body.style.backgroundColor; + """ + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://example.com", + config=CrawlerRunConfig( + js_code=js_code, + page_timeout=30000 + ) + ) + assert result.success + +@pytest.mark.asyncio +@pytest.mark.parametrize("error_url", [ + "https://invalid.domain.test", + "https://httpbin.org/status/404", + "https://httpbin.org/status/503", + "https://httpbin.org/status/403" +]) +async def test_error_handling(error_url): + """Test error handling for various failure scenarios""" + browser_config = BrowserConfig( + browser_type="chromium", + headless=True + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url=error_url, + config=CrawlerRunConfig( + page_timeout=10000, # Short timeout for error cases + cache_mode=CacheMode.BYPASS + ) + ) + assert not result.success + assert result.error_message is not None + +if __name__ == "__main__": + asyncio.run(test_viewport_config((1024, 768))) + asyncio.run(test_memory_management()) + asyncio.run(test_rate_limiting()) + asyncio.run(test_javascript_execution()) \ No newline at end of file