Files
crawl4ai/crawl4ai/__init__.py
UncleCode 9b5ccac76e feat(extraction): add RegexExtractionStrategy for pattern-based extraction
Add new RegexExtractionStrategy for fast, zero-LLM extraction of common data types:
- Built-in patterns for emails, URLs, phones, dates, and more
- Support for custom regex patterns
- LLM-assisted pattern generation utility
- Optimized HTML preprocessing with fit_html field
- Enhanced network response body capture

Breaking changes: None
2025-05-02 21:15:24 +08:00

157 lines
4.0 KiB
Python

# __init__.py
import warnings
from .async_webcrawler import AsyncWebCrawler, CacheMode
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig
from .content_scraping_strategy import (
ContentScrapingStrategy,
WebScrapingStrategy,
LXMLWebScrapingStrategy,
)
from .async_logger import (
AsyncLoggerBase,
AsyncLogger,
)
from .proxy_strategy import (
ProxyRotationStrategy,
RoundRobinProxyStrategy,
)
from .extraction_strategy import (
ExtractionStrategy,
LLMExtractionStrategy,
CosineStrategy,
JsonCssExtractionStrategy,
JsonXPathExtractionStrategy,
JsonLxmlExtractionStrategy,
RegexExtractionStrategy
)
from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import DefaultMarkdownGenerator
from .content_filter_strategy import (
PruningContentFilter,
BM25ContentFilter,
LLMContentFilter,
RelevantContentFilter,
)
from .models import CrawlResult, MarkdownGenerationResult, DisplayMode
from .components.crawler_monitor import CrawlerMonitor
from .async_dispatcher import (
MemoryAdaptiveDispatcher,
SemaphoreDispatcher,
RateLimiter,
BaseDispatcher,
)
from .docker_client import Crawl4aiDockerClient
from .hub import CrawlerHub
from .browser_profiler import BrowserProfiler
from .deep_crawling import (
DeepCrawlStrategy,
BFSDeepCrawlStrategy,
FilterChain,
URLPatternFilter,
DomainFilter,
ContentTypeFilter,
URLFilter,
FilterStats,
SEOFilter,
KeywordRelevanceScorer,
URLScorer,
CompositeScorer,
DomainAuthorityScorer,
FreshnessScorer,
PathDepthScorer,
BestFirstCrawlingStrategy,
DFSDeepCrawlStrategy,
DeepCrawlDecorator,
)
__all__ = [
"AsyncLoggerBase",
"AsyncLogger",
"AsyncWebCrawler",
"BrowserProfiler",
"LLMConfig",
"GeolocationConfig",
"DeepCrawlStrategy",
"BFSDeepCrawlStrategy",
"BestFirstCrawlingStrategy",
"DFSDeepCrawlStrategy",
"FilterChain",
"URLPatternFilter",
"ContentTypeFilter",
"DomainFilter",
"FilterStats",
"URLFilter",
"SEOFilter",
"KeywordRelevanceScorer",
"URLScorer",
"CompositeScorer",
"DomainAuthorityScorer",
"FreshnessScorer",
"PathDepthScorer",
"DeepCrawlDecorator",
"CrawlResult",
"CrawlerHub",
"CacheMode",
"ContentScrapingStrategy",
"WebScrapingStrategy",
"LXMLWebScrapingStrategy",
"BrowserConfig",
"CrawlerRunConfig",
"HTTPCrawlerConfig",
"ExtractionStrategy",
"LLMExtractionStrategy",
"CosineStrategy",
"JsonCssExtractionStrategy",
"JsonXPathExtractionStrategy",
"JsonLxmlExtractionStrategy",
"RegexExtractionStrategy",
"ChunkingStrategy",
"RegexChunking",
"DefaultMarkdownGenerator",
"RelevantContentFilter",
"PruningContentFilter",
"BM25ContentFilter",
"LLMContentFilter",
"BaseDispatcher",
"MemoryAdaptiveDispatcher",
"SemaphoreDispatcher",
"RateLimiter",
"CrawlerMonitor",
"DisplayMode",
"MarkdownGenerationResult",
"Crawl4aiDockerClient",
"ProxyRotationStrategy",
"RoundRobinProxyStrategy",
"ProxyConfig"
]
# def is_sync_version_installed():
# try:
# import selenium # noqa
# return True
# except ImportError:
# return False
# if is_sync_version_installed():
# try:
# from .web_crawler import WebCrawler
# __all__.append("WebCrawler")
# except ImportError:
# print(
# "Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies."
# )
# else:
# WebCrawler = None
# # import warnings
# # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
# Disable all Pydantic warnings
warnings.filterwarnings("ignore", module="pydantic")
# pydantic_warnings.filter_warnings()