Adds a new BrowserProfiler class that provides comprehensive management of browser profiles for identity-based crawling. Features include: - Interactive profile creation and management - Profile listing, retrieval, and deletion - Guided console interface - Migration of profile management from ManagedBrowser - New example script for identity-based browsing ALSO: - Updates logging format in AsyncWebCrawler - Removes content filter from hello_world example - Relaxes httpx version constraint BREAKING CHANGE: Profile management methods from ManagedBrowser are now deprecated and delegate to BrowserProfiler
148 lines
3.7 KiB
Python
148 lines
3.7 KiB
Python
# __init__.py
|
|
import warnings
|
|
|
|
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
|
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig
|
|
from .content_scraping_strategy import (
|
|
ContentScrapingStrategy,
|
|
WebScrapingStrategy,
|
|
LXMLWebScrapingStrategy,
|
|
)
|
|
from .async_logger import (
|
|
AsyncLoggerBase,
|
|
AsyncLogger,
|
|
)
|
|
from .proxy_strategy import (
|
|
ProxyRotationStrategy,
|
|
RoundRobinProxyStrategy,
|
|
)
|
|
from .extraction_strategy import (
|
|
ExtractionStrategy,
|
|
LLMExtractionStrategy,
|
|
CosineStrategy,
|
|
JsonCssExtractionStrategy,
|
|
JsonXPathExtractionStrategy,
|
|
)
|
|
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
|
from .markdown_generation_strategy import DefaultMarkdownGenerator
|
|
from .content_filter_strategy import (
|
|
PruningContentFilter,
|
|
BM25ContentFilter,
|
|
LLMContentFilter,
|
|
RelevantContentFilter,
|
|
)
|
|
from .models import CrawlResult, MarkdownGenerationResult
|
|
from .async_dispatcher import (
|
|
MemoryAdaptiveDispatcher,
|
|
SemaphoreDispatcher,
|
|
RateLimiter,
|
|
CrawlerMonitor,
|
|
DisplayMode,
|
|
BaseDispatcher,
|
|
)
|
|
from .docker_client import Crawl4aiDockerClient
|
|
from .hub import CrawlerHub
|
|
from .browser_profiler import BrowserProfiler
|
|
from .deep_crawling import (
|
|
DeepCrawlStrategy,
|
|
BFSDeepCrawlStrategy,
|
|
FilterChain,
|
|
ContentTypeFilter,
|
|
DomainFilter,
|
|
URLFilter,
|
|
FilterStats,
|
|
SEOFilter,
|
|
KeywordRelevanceScorer,
|
|
URLScorer,
|
|
CompositeScorer,
|
|
DomainAuthorityScorer,
|
|
FreshnessScorer,
|
|
PathDepthScorer,
|
|
BestFirstCrawlingStrategy,
|
|
DFSDeepCrawlStrategy,
|
|
DeepCrawlDecorator,
|
|
)
|
|
|
|
__all__ = [
|
|
"AsyncLoggerBase",
|
|
"AsyncLogger",
|
|
"AsyncWebCrawler",
|
|
"BrowserProfiler",
|
|
"DeepCrawlStrategy",
|
|
"BFSDeepCrawlStrategy",
|
|
"BestFirstCrawlingStrategy",
|
|
"DFSDeepCrawlStrategy",
|
|
"FilterChain",
|
|
"ContentTypeFilter",
|
|
"DomainFilter",
|
|
"FilterStats",
|
|
"URLFilter",
|
|
"SEOFilter",
|
|
"KeywordRelevanceScorer",
|
|
"URLScorer",
|
|
"CompositeScorer",
|
|
"DomainAuthorityScorer",
|
|
"FreshnessScorer",
|
|
"PathDepthScorer",
|
|
"DeepCrawlDecorator",
|
|
"CrawlResult",
|
|
"CrawlerHub",
|
|
"CacheMode",
|
|
"ContentScrapingStrategy",
|
|
"WebScrapingStrategy",
|
|
"LXMLWebScrapingStrategy",
|
|
"BrowserConfig",
|
|
"CrawlerRunConfig",
|
|
"HTTPCrawlerConfig",
|
|
"ExtractionStrategy",
|
|
"LLMExtractionStrategy",
|
|
"CosineStrategy",
|
|
"JsonCssExtractionStrategy",
|
|
"JsonXPathExtractionStrategy",
|
|
"ChunkingStrategy",
|
|
"RegexChunking",
|
|
"DefaultMarkdownGenerator",
|
|
"RelevantContentFilter",
|
|
"PruningContentFilter",
|
|
"BM25ContentFilter",
|
|
"LLMContentFilter",
|
|
"BaseDispatcher",
|
|
"MemoryAdaptiveDispatcher",
|
|
"SemaphoreDispatcher",
|
|
"RateLimiter",
|
|
"CrawlerMonitor",
|
|
"DisplayMode",
|
|
"MarkdownGenerationResult",
|
|
"Crawl4aiDockerClient",
|
|
"ProxyRotationStrategy",
|
|
"RoundRobinProxyStrategy",
|
|
]
|
|
|
|
|
|
# def is_sync_version_installed():
|
|
# try:
|
|
# import selenium # noqa
|
|
|
|
# return True
|
|
# except ImportError:
|
|
# return False
|
|
|
|
|
|
# if is_sync_version_installed():
|
|
# try:
|
|
# from .web_crawler import WebCrawler
|
|
|
|
# __all__.append("WebCrawler")
|
|
# except ImportError:
|
|
# print(
|
|
# "Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies."
|
|
# )
|
|
# else:
|
|
# WebCrawler = None
|
|
# # import warnings
|
|
# # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
|
|
|
|
# Disable all Pydantic warnings
|
|
warnings.filterwarnings("ignore", module="pydantic")
|
|
# pydantic_warnings.filter_warnings()
|