build: streamline package discovery and bump to v0.4.244

- Replace explicit package listing with setuptools.find
- Include all crawl4ai.* packages automatically
- Use `packages = {find = {where = ["."], include = ["crawl4ai*"]}}` syntax
- Bump version to 0.4.244

This change simplifies package maintenance by automatically discovering all subpackages under the crawl4ai namespace instead of listing them manually.
2025-01-01 17:53:51 +08:00
6 changed files with 82 additions and 145 deletions
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@
 [![Python Version](https://img.shields.io/pypi/pyversions/crawl4ai)](https://pypi.org/project/crawl4ai/)
 [![Downloads](https://static.pepy.tech/badge/crawl4ai/month)](https://pepy.tech/project/crawl4ai)

-<!-- [![Documentation Status](https://readthedocs.org/projects/crawl4ai/badge/?version=latest)](https://crawl4ai.readthedocs.io/) -->
+[![Documentation Status](https://readthedocs.org/projects/crawl4ai/badge/?version=latest)](https://crawl4ai.readthedocs.io/)
 [![License](https://img.shields.io/github/license/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/blob/main/LICENSE)
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 [![Security: bandit](https://img.shields.io/badge/security-bandit-yellow.svg)](https://github.com/PyCQA/bandit)
@@ -20,9 +20,9 @@

 Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease.  

-[✨ Check out latest update v0.4.24x](#-recent-updates)
+[✨ Check out latest update v0.4.24](#-recent-updates)

-🎉 **Version 0.4.24x is out!** Major improvements in extraction strategies with enhanced JSON handling, SSL security, and Amazon product extraction. Plus, a completely revamped content filtering system! [Read the release notes →](https://crawl4ai.com/mkdocs/blog)
+🎉 **Version 0.4.24 is out!** Major improvements in extraction strategies with enhanced JSON handling, SSL security, and Amazon product extraction. Plus, a completely revamped content filtering system! [Read the release notes →](https://crawl4ai.com/mkdocs/blog)

 ## 🧐 Why Crawl4AI?

@@ -38,7 +38,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant
 1. Install Crawl4AI:
 ```bash
 # Install the package
-pip install -U crawl4ai
+pip install crawl4ai

 # Run post-installation setup
 crawl4ai-setup
--- a/crawl4ai/version.py
+++ b/crawl4ai/version.py
@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.4.246"
+__version__ = "0.4.244"
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -35,9 +35,7 @@ class BrowserConfig:
        user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
                                     temporary directory may be used. Default: None.
        chrome_channel (str): The Chrome channel to launch (e.g., "chrome", "msedge"). Only applies if browser_type
-                              is "chromium". Default: "chromium".
-        channel (str): The channel to launch (e.g., "chromium", "chrome", "msedge"). Only applies if browser_type
-                              is "chromium". Default: "chromium".
+                              is "chromium". Default: "chrome".
        proxy (str or None): Proxy server URL (e.g., "http://username:password@proxy:port"). If None, no proxy is used.
                             Default: None.
        proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
@@ -79,8 +77,7 @@ class BrowserConfig:
        use_managed_browser: bool = False,
        use_persistent_context: bool = False,
        user_data_dir: str = None,
-        chrome_channel: str = "chromium",
-        channel: str = "chromium",
+        chrome_channel: str = "chrome",
        proxy: str = None,
        proxy_config: dict = None,
        viewport_width: int = 1080,
@@ -110,8 +107,14 @@ class BrowserConfig:
        self.use_managed_browser = use_managed_browser
        self.use_persistent_context = use_persistent_context
        self.user_data_dir = user_data_dir
-        self.chrome_channel = chrome_channel or self.browser_type or "chromium"
-        self.channel = channel or self.browser_type or "chromium"
+        if self.browser_type == "chromium":
+            self.chrome_channel = "chrome"
+        elif self.browser_type == "firefox":
+            self.chrome_channel = "firefox"
+        elif self.browser_type == "webkit":
+            self.chrome_channel = "webkit"
+        else:
+            self.chrome_channel = chrome_channel or "chrome"
        self.proxy = proxy
        self.proxy_config = proxy_config
        self.viewport_width = viewport_width
@@ -158,8 +161,7 @@ class BrowserConfig:
            use_managed_browser=kwargs.get("use_managed_browser", False),
            use_persistent_context=kwargs.get("use_persistent_context", False),
            user_data_dir=kwargs.get("user_data_dir"),
-            chrome_channel=kwargs.get("chrome_channel", "chromium"),
-            channel=kwargs.get("channel", "chromium"),
+            chrome_channel=kwargs.get("chrome_channel", "chrome"),
            proxy=kwargs.get("proxy"),
            proxy_config=kwargs.get("proxy_config"),
            viewport_width=kwargs.get("viewport_width", 1080),
@@ -246,7 +248,7 @@ class CrawlerRunConfig:
        wait_for (str or None): A CSS selector or JS condition to wait for before extracting content.
                                Default: None.
        wait_for_images (bool): If True, wait for images to load before extracting content.
-                                Default: False.
+                                Default: True.
        delay_before_return_html (float): Delay in seconds before retrieving final HTML.
                                          Default: 0.1.
        mean_delay (float): Mean base delay between requests when calling arun_many.
@@ -345,7 +347,7 @@ class CrawlerRunConfig:
        wait_until: str = "domcontentloaded",
        page_timeout: int = PAGE_TIMEOUT,
        wait_for: str = None,
-        wait_for_images: bool = False,
+        wait_for_images: bool = True,
        delay_before_return_html: float = 0.1,
        mean_delay: float = 0.1,
        max_range: float = 0.3,
@@ -503,7 +505,7 @@ class CrawlerRunConfig:
            wait_until=kwargs.get("wait_until", "domcontentloaded"),
            page_timeout=kwargs.get("page_timeout", 60000),
            wait_for=kwargs.get("wait_for"),
-            wait_for_images=kwargs.get("wait_for_images", False),
+            wait_for_images=kwargs.get("wait_for_images", True),
            delay_before_return_html=kwargs.get("delay_before_return_html", 0.1),
            mean_delay=kwargs.get("mean_delay", 0.1),
            max_range=kwargs.get("max_range", 0.3),
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -418,30 +418,34 @@ class AsyncWebCrawler:
                            **kwargs
                        )

-                        crawl_result.status_code = async_response.status_code
-                        crawl_result.response_headers = async_response.response_headers
-                        crawl_result.downloaded_files = async_response.downloaded_files
-                        crawl_result.ssl_certificate = async_response.ssl_certificate  # Add SSL certificate
+                    #     crawl_result.status_code = async_response.status_code
+                    #     crawl_result.response_headers = async_response.response_headers
+                    #     crawl_result.downloaded_files = async_response.downloaded_files
+                    #     crawl_result.ssl_certificate = async_response.ssl_certificate  # Add SSL certificate
+                    # else:
+                    #     crawl_result.status_code = 200
+                    #     crawl_result.response_headers = cached_result.response_headers if cached_result else {}
+                    #     crawl_result.ssl_certificate = cached_result.ssl_certificate if cached_result else None  # Add SSL certificate from cache

                        # # Check and set values from async_response to crawl_result
-                        # try:
-                        #     for key in vars(async_response):
-                        #         if hasattr(crawl_result, key):
-                        #             value = getattr(async_response, key, None)
-                        #             current_value = getattr(crawl_result, key, None)
-                        #             if value is not None and not current_value:
-                        #                 try:
-                        #                     setattr(crawl_result, key, value)
-                        #                 except Exception as e:
-                        #                     self.logger.warning(
-                        #                         message=f"Failed to set attribute {key}: {str(e)}",
-                        #                         tag="WARNING"
-                        #                     )
-                        # except Exception as e:
-                        #     self.logger.warning(
-                        #         message=f"Error copying response attributes: {str(e)}",
-                        #         tag="WARNING"
-                        #     )
+                        try:
+                            for key in vars(async_response):
+                                if hasattr(crawl_result, key):
+                                    value = getattr(async_response, key, None)
+                                    current_value = getattr(crawl_result, key, None)
+                                    if value is not None and not current_value:
+                                        try:
+                                            setattr(crawl_result, key, value)
+                                        except Exception as e:
+                                            self.logger.warning(
+                                                message=f"Failed to set attribute {key}: {str(e)}",
+                                                tag="WARNING"
+                                            )
+                        except Exception as e:
+                            self.logger.warning(
+                                message=f"Error copying response attributes: {str(e)}",
+                                tag="WARNING"
+                            )

                        crawl_result.success = bool(html)
                        crawl_result.session_id = getattr(config, 'session_id', None)
@@ -581,10 +585,8 @@ class AsyncWebCrawler:

            # Markdown Generation
            markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
-            
-            # Uncomment if by default we want to use PruningContentFilter
-            # if not config.content_filter and not markdown_generator.content_filter:
-            #     markdown_generator.content_filter = PruningContentFilter()
+            if not config.content_filter and not markdown_generator.content_filter:
+                markdown_generator.content_filter = PruningContentFilter()
            
            markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
                cleaned_html=cleaned_html,
--- a/crawl4ai/markdown_generation_strategy.py
+++ b/crawl4ai/markdown_generation_strategy.py
@@ -143,83 +143,41 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
        Returns:
            MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
        """
-        try:
-            # Initialize HTML2Text with default options for better conversion
-            h = CustomHTML2Text(baseurl=base_url)
-            default_options = {
-                'body_width': 0,  # Disable text wrapping
-                'ignore_emphasis': False,
-                'ignore_links': False,
-                'ignore_images': False,
-                'protect_links': True,
-                'single_line_break': True,
-                'mark_code': True,
-                'escape_snob': False
-            }
-            
-            # Update with custom options if provided
-            if html2text_options:
-                default_options.update(html2text_options)
-            elif options:
-                default_options.update(options)
-            elif self.options:
-                default_options.update(self.options)
-            
-            h.update_params(**default_options)
+        # Initialize HTML2Text with options
+        h = CustomHTML2Text()
+        if html2text_options:
+            h.update_params(**html2text_options)
+        elif options:
+            h.update_params(**options)
+        elif self.options:
+            h.update_params(**self.options)

-            # Ensure we have valid input
-            if not cleaned_html:
-                cleaned_html = ""
-            elif not isinstance(cleaned_html, str):
-                cleaned_html = str(cleaned_html)
+        # Generate raw markdown
+        raw_markdown = h.handle(cleaned_html)
+        raw_markdown = raw_markdown.replace('    ```', '```')

-            # Generate raw markdown
-            try:
-                raw_markdown = h.handle(cleaned_html)
-            except Exception as e:
-                raw_markdown = f"Error converting HTML to markdown: {str(e)}"
-            
-            raw_markdown = raw_markdown.replace('    ```', '```')
-
-            # Convert links to citations
-            markdown_with_citations: str = raw_markdown
-            references_markdown: str = ""
-            if citations:
-                try:
-                    markdown_with_citations, references_markdown = self.convert_links_to_citations(
-                        raw_markdown, base_url
-                    )
-                except Exception as e:
-                    markdown_with_citations = raw_markdown
-                    references_markdown = f"Error generating citations: {str(e)}"
-
-            # Generate fit markdown if content filter is provided
-            fit_markdown: Optional[str] = ""
-            filtered_html: Optional[str] = ""
-            if content_filter or self.content_filter:
-                try:
-                    content_filter = content_filter or self.content_filter
-                    filtered_html = content_filter.filter_content(cleaned_html)
-                    filtered_html = '\n'.join('<div>{}</div>'.format(s) for s in filtered_html)
-                    fit_markdown = h.handle(filtered_html)
-                except Exception as e:
-                    fit_markdown = f"Error generating fit markdown: {str(e)}"
-                    filtered_html = ""
-
-            return MarkdownGenerationResult(
-                raw_markdown=raw_markdown or "",
-                markdown_with_citations=markdown_with_citations or "",
-                references_markdown=references_markdown or "",
-                fit_markdown=fit_markdown or "",
-                fit_html=filtered_html or "",
-            )
-        except Exception as e:
-            # If anything fails, return empty strings with error message
-            error_msg = f"Error in markdown generation: {str(e)}"
-            return MarkdownGenerationResult(
-                raw_markdown=error_msg,
-                markdown_with_citations=error_msg,
-                references_markdown="",
-                fit_markdown="",
-                fit_html="",
+        # Convert links to citations
+        markdown_with_citations: str = ""
+        references_markdown: str = ""
+        if citations:
+            markdown_with_citations, references_markdown = self.convert_links_to_citations(
+                raw_markdown, base_url
            )
+
+        # Generate fit markdown if content filter is provided
+        fit_markdown: Optional[str] = ""
+        filtered_html: Optional[str] = ""
+        if content_filter or self.content_filter:
+            content_filter = content_filter or self.content_filter
+            filtered_html = content_filter.filter_content(cleaned_html)
+            filtered_html = '\n'.join('<div>{}</div>'.format(s) for s in filtered_html)
+            fit_markdown = h.handle(filtered_html)
+
+        return MarkdownGenerationResult(
+            raw_markdown=raw_markdown,
+            markdown_with_citations=markdown_with_citations,
+            references_markdown=references_markdown,
+            fit_markdown=fit_markdown,
+            fit_html=filtered_html,
+        )
+
--- a/docs/examples/hello_world.py
+++ b/docs/examples/hello_world.py
@@ -1,25 +0,0 @@
-import os, sys
-
-sys.path.append(
-    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-)
-
-import asyncio
-from crawl4ai import *
-
-async def main():
-    async with AsyncWebCrawler() as crawler:
-        crawler_config = CrawlerRunConfig(
-            cache_mode=CacheMode.BYPASS,
-            markdown_generator=DefaultMarkdownGenerator(
-                content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
-            )
-        )
-        result = await crawler.arun(
-            url="https://www.nbcnews.com/business",
-            config=crawler_config
-        )
-        print(result.markdown_v2.raw_markdown[:500])
-
-if __name__ == "__main__":
-    asyncio.run(main())