* Fix: Use correct URL variable for raw HTML extraction (#1116) - Prevents full HTML content from being passed as URL to extraction strategies - Added unit tests to verify raw HTML and regular URL processing Fix: Wrong URL variable used for extraction of raw html * Fix #1181: Preserve whitespace in code blocks during HTML scraping The remove_empty_elements_fast() method was removing whitespace-only span elements inside <pre> and <code> tags, causing import statements like "import torch" to become "importtorch". Now skips elements inside code blocks where whitespace is significant. * Refactor Pydantic model configuration to use ConfigDict for arbitrary types * Fix EmbeddingStrategy: Uncomment response handling for the variations and clean up mock data. ref #1621 * Fix: permission issues with .cache/url_seeder and other runtime cache dirs. ref #1638 * fix: ensure BrowserConfig.to_dict serializes proxy_config * feat: make LLM backoff configurable end-to-end - extend LLMConfig with backoff delay/attempt/factor fields and thread them through LLMExtractionStrategy, LLMContentFilter, table extraction, and Docker API handlers - expose the backoff parameter knobs on perform_completion_with_backoff/aperform_completion_with_backoff and document them in the md_v2 guides * reproduced AttributeError from #1642 * pass timeout parameter to docker client request * added missing deep crawling objects to init * generalized query in ContentRelevanceFilter to be a str or list * import modules from enhanceable deserialization * parameterized tests * Fix: capture current page URL to reflect JavaScript navigation and add test for delayed redirects. ref #1268 * refactor: replace PyPDF2 with pypdf across the codebase. ref #1412 * announcement: add application form for cloud API closed beta * Release v0.7.8: Stability & Bug Fix Release - Updated version to 0.7.8 - Introduced focused stability release addressing 11 community-reported bugs. 
- Key fixes include Docker API improvements, LLM extraction enhancements, URL handling corrections, and dependency updates. - Added detailed release notes for v0.7.8 in the blog and created a dedicated verification script to ensure all fixes are functioning as intended. - Updated documentation to reflect recent changes and improvements. * docs: add section for Crawl4AI Cloud API closed beta with application link * fix: add disk cleanup step to Docker workflow --------- Co-authored-by: rbushria <rbushri@gmail.com> Co-authored-by: AHMET YILMAZ <tawfik@kidocode.com> Co-authored-by: Soham Kukreti <kukretisoham@gmail.com> Co-authored-by: Chris Murphy <chris.murphy@klaviyo.com> Co-authored-by: Aravind Karnam <aravind.karanam@gmail.com>
230 lines
5.8 KiB
Python
230 lines
5.8 KiB
Python
# __init__.py
|
|
import warnings
|
|
|
|
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
|
# Configuration classes (including SeedingConfig and VirtualScrollConfig)
|
|
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig, MatchMode
|
|
|
|
from .content_scraping_strategy import (
|
|
ContentScrapingStrategy,
|
|
LXMLWebScrapingStrategy,
|
|
WebScrapingStrategy, # Backward compatibility alias
|
|
)
|
|
from .async_logger import (
|
|
AsyncLoggerBase,
|
|
AsyncLogger,
|
|
)
|
|
from .proxy_strategy import (
|
|
ProxyRotationStrategy,
|
|
RoundRobinProxyStrategy,
|
|
)
|
|
from .extraction_strategy import (
|
|
ExtractionStrategy,
|
|
LLMExtractionStrategy,
|
|
CosineStrategy,
|
|
JsonCssExtractionStrategy,
|
|
JsonXPathExtractionStrategy,
|
|
JsonLxmlExtractionStrategy,
|
|
RegexExtractionStrategy
|
|
)
|
|
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
|
from .markdown_generation_strategy import DefaultMarkdownGenerator
|
|
from .table_extraction import (
|
|
TableExtractionStrategy,
|
|
DefaultTableExtraction,
|
|
NoTableExtraction,
|
|
LLMTableExtraction,
|
|
)
|
|
from .content_filter_strategy import (
|
|
PruningContentFilter,
|
|
BM25ContentFilter,
|
|
LLMContentFilter,
|
|
RelevantContentFilter,
|
|
)
|
|
from .models import CrawlResult, MarkdownGenerationResult, DisplayMode
|
|
from .components.crawler_monitor import CrawlerMonitor
|
|
from .link_preview import LinkPreview
|
|
from .async_dispatcher import (
|
|
MemoryAdaptiveDispatcher,
|
|
SemaphoreDispatcher,
|
|
RateLimiter,
|
|
BaseDispatcher,
|
|
)
|
|
from .docker_client import Crawl4aiDockerClient
|
|
from .hub import CrawlerHub
|
|
from .browser_profiler import BrowserProfiler
|
|
from .deep_crawling import (
|
|
DeepCrawlStrategy,
|
|
BFSDeepCrawlStrategy,
|
|
FilterChain,
|
|
URLPatternFilter,
|
|
DomainFilter,
|
|
ContentTypeFilter,
|
|
URLFilter,
|
|
FilterStats,
|
|
SEOFilter,
|
|
KeywordRelevanceScorer,
|
|
URLScorer,
|
|
CompositeScorer,
|
|
DomainAuthorityScorer,
|
|
FreshnessScorer,
|
|
PathDepthScorer,
|
|
BestFirstCrawlingStrategy,
|
|
DFSDeepCrawlStrategy,
|
|
DeepCrawlDecorator,
|
|
ContentRelevanceFilter,
|
|
ContentTypeScorer,
|
|
)
|
|
# URL seeding
|
|
from .async_url_seeder import AsyncUrlSeeder
|
|
# Adaptive Crawler
|
|
from .adaptive_crawler import (
|
|
AdaptiveCrawler,
|
|
AdaptiveConfig,
|
|
CrawlState,
|
|
CrawlStrategy,
|
|
StatisticalStrategy
|
|
)
|
|
|
|
# C4A Script Language Support
|
|
from .script import (
|
|
compile as c4a_compile,
|
|
validate as c4a_validate,
|
|
compile_file as c4a_compile_file,
|
|
CompilationResult,
|
|
ValidationResult,
|
|
ErrorDetail
|
|
)
|
|
|
|
# Browser Adapters
|
|
from .browser_adapter import (
|
|
BrowserAdapter,
|
|
PlaywrightAdapter,
|
|
UndetectedAdapter
|
|
)
|
|
|
|
from .utils import (
|
|
start_colab_display_server,
|
|
setup_colab_environment,
|
|
hooks_to_string
|
|
)
|
|
|
|
# Public API: the names bound by a star-import of this package.
# Every entry must match a name imported earlier in this module; a string
# with no matching module attribute makes the star-import raise
# AttributeError.
__all__ = [
    "AsyncLoggerBase",
    "AsyncLogger",
    "AsyncWebCrawler",
    "BrowserProfiler",
    "LLMConfig",
    "GeolocationConfig",
    # Seeding and virtual-scroll configuration
    "SeedingConfig",
    "VirtualScrollConfig",
    # URL seeder
    "AsyncUrlSeeder",
    # Adaptive crawler
    "AdaptiveCrawler",
    "AdaptiveConfig",
    "CrawlState",
    "CrawlStrategy",
    "StatisticalStrategy",
    # Deep crawling: strategies, filters and scorers
    "DeepCrawlStrategy",
    "BFSDeepCrawlStrategy",
    "BestFirstCrawlingStrategy",
    "DFSDeepCrawlStrategy",
    "FilterChain",
    "URLPatternFilter",
    "ContentTypeFilter",
    "DomainFilter",
    "FilterStats",
    "URLFilter",
    "SEOFilter",
    "ContentRelevanceFilter",  # imported from .deep_crawling, previously not exported
    "KeywordRelevanceScorer",
    "URLScorer",
    "CompositeScorer",
    "DomainAuthorityScorer",
    "FreshnessScorer",
    "PathDepthScorer",
    "ContentTypeScorer",  # imported from .deep_crawling, previously not exported
    "DeepCrawlDecorator",
    "CrawlResult",
    "CrawlerHub",
    "CacheMode",
    "MatchMode",
    "ContentScrapingStrategy",
    "WebScrapingStrategy",
    "LXMLWebScrapingStrategy",
    "BrowserConfig",
    "CrawlerRunConfig",
    "HTTPCrawlerConfig",
    "ExtractionStrategy",
    "LLMExtractionStrategy",
    "CosineStrategy",
    "JsonCssExtractionStrategy",
    "JsonXPathExtractionStrategy",
    "JsonLxmlExtractionStrategy",
    "RegexExtractionStrategy",
    "ChunkingStrategy",
    "RegexChunking",
    "DefaultMarkdownGenerator",
    "TableExtractionStrategy",
    "DefaultTableExtraction",
    "NoTableExtraction",
    "LLMTableExtraction",  # imported from .table_extraction, previously not exported
    "RelevantContentFilter",
    "PruningContentFilter",
    "BM25ContentFilter",
    "LLMContentFilter",
    "BaseDispatcher",
    "MemoryAdaptiveDispatcher",
    "SemaphoreDispatcher",
    "RateLimiter",
    "CrawlerMonitor",
    "LinkPreview",
    "DisplayMode",
    "MarkdownGenerationResult",
    "Crawl4aiDockerClient",
    "ProxyRotationStrategy",
    "RoundRobinProxyStrategy",
    "ProxyConfig",
    "start_colab_display_server",
    "setup_colab_environment",
    "hooks_to_string",
    # C4A Script additions
    "c4a_compile",
    "c4a_validate",
    "c4a_compile_file",
    "CompilationResult",
    "ValidationResult",
    "ErrorDetail",
    # Browser adapters
    "BrowserAdapter",
    "PlaywrightAdapter",
    "UndetectedAdapter",
    "LinkPreviewConfig",
]
|
|
|
|
|
|
# def is_sync_version_installed():
|
|
# try:
|
|
# import selenium # noqa
|
|
|
|
# return True
|
|
# except ImportError:
|
|
# return False
|
|
|
|
|
|
# if is_sync_version_installed():
|
|
# try:
|
|
# from .web_crawler import WebCrawler
|
|
|
|
# __all__.append("WebCrawler")
|
|
# except ImportError:
|
|
# print(
|
|
# "Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies."
|
|
# )
|
|
# else:
|
|
# WebCrawler = None
|
|
# # import warnings
|
|
# # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
|
|
|
|
# Install an "ignore" filter for every warning whose originating module
# name matches the pattern "pydantic".
warnings.filterwarnings(action="ignore", module="pydantic")
# pydantic_warnings.filter_warnings()