Files
crawl4ai/crawl4ai/__init__.py
UncleCode f3ae5a657c feat(scraping): add LXML-based scraping mode for improved performance
Adds a new ScrapingMode enum to allow switching between BeautifulSoup and LXML parsing.
LXML mode offers 10-20x better performance for large HTML documents.

Key changes:
- Added ScrapingMode enum with BEAUTIFULSOUP and LXML options
- Implemented LXMLWebScrapingStrategy class
- Added LXML-based metadata extraction
- Updated documentation with scraping mode usage and performance considerations
- Added cssselect dependency

BREAKING CHANGE: None
2025-01-12 20:46:23 +08:00

54 lines
1.8 KiB
Python

# __init__.py
from .async_webcrawler import AsyncWebCrawler, CacheMode
from .async_configs import BrowserConfig, CrawlerRunConfig, ScrapingMode
from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy, CosineStrategy, JsonCssExtractionStrategy
from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import DefaultMarkdownGenerator
from .content_filter_strategy import PruningContentFilter, BM25ContentFilter
from .models import CrawlResult, MarkdownGenerationResult
from .async_dispatcher import MemoryAdaptiveDispatcher, SemaphoreDispatcher, RateLimiter, CrawlerMonitor, DisplayMode
from .__version__ import __version__
__all__ = [
"AsyncWebCrawler",
"CrawlResult",
"CacheMode",
"ScrapingMode",
'BrowserConfig',
'CrawlerRunConfig',
'ExtractionStrategy',
'LLMExtractionStrategy',
'CosineStrategy',
'JsonCssExtractionStrategy',
'ChunkingStrategy',
'RegexChunking',
'DefaultMarkdownGenerator',
'PruningContentFilter',
'BM25ContentFilter',
'MemoryAdaptiveDispatcher',
'SemaphoreDispatcher',
'RateLimiter',
'CrawlerMonitor',
'DisplayMode',
'MarkdownGenerationResult',
]
def is_sync_version_installed():
try:
import selenium
return True
except ImportError:
return False
if is_sync_version_installed():
try:
from .web_crawler import WebCrawler
__all__.append("WebCrawler")
except ImportError:
import warnings
print("Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies.")
else:
WebCrawler = None
# import warnings
# print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")