Updated to version 0.4.0 with new features

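- Enhanced error handling in the async crawler: load-state wait timeouts are ignored, and image-dimension update failures are logged instead of raised.
- Added a flexible `options` parameter to Markdown generation.
- Updated user agent settings for improved reliability.
- Reflected these changes in the documentation and examples.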
2024-12-04 20:26:39 +08:00
parent b02544bc0b
commit 486db3a771
5 changed files with 69 additions and 16 deletions
--- a/crawl4ai/version.py
+++ b/crawl4ai/version.py
@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.3.747"
+__version__ = "0.4.0"
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -6,6 +6,7 @@ from typing import Callable, Dict, Any, List, Optional, Awaitable
 import os, sys, shutil
 import tempfile, subprocess
 from playwright.async_api import async_playwright, Page, Browser, Error
+from playwright.async_api import TimeoutError as PlaywrightTimeoutError
 from io import BytesIO
 from PIL import Image, ImageDraw, ImageFont
 from pathlib import Path
@@ -223,6 +224,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        self.use_cached_html = use_cached_html
        self.user_agent = kwargs.get(
            "user_agent",
+            # "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47"
            "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36"
        )
        user_agenr_generator = UserAgentGenerator()
@@ -941,11 +943,24 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                });
            }
            """
+            
            try:
-                await page.wait_for_load_state()
+                try:
+                    await page.wait_for_load_state(
+                        # state="load",
+                        state="domcontentloaded",
+                        timeout=5
+                    )
+                except PlaywrightTimeoutError:
+                    pass
                await page.evaluate(update_image_dimensions_js)
            except Exception as e:
-                raise RuntimeError(f"Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {str(e)}")
+                self.logger.error(
+                    message="Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {error}",
+                    tag="ERROR",
+                    params={"error": str(e)}
+                )
+                # raise RuntimeError(f"Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {str(e)}")

            # Wait a bit for any onload events to complete
            await page.wait_for_timeout(100)
--- a/crawl4ai/markdown_generation_strategy.py
+++ b/crawl4ai/markdown_generation_strategy.py
@@ -11,8 +11,9 @@ LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)')

 class MarkdownGenerationStrategy(ABC):
    """Abstract base class for markdown generation strategies."""
-    def __init__(self, content_filter: Optional[RelevantContentFilter] = None):
+    def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
        self.content_filter = content_filter
+        self.options = options or {}
    
    @abstractmethod
    def generate_markdown(self, 
@@ -27,8 +28,8 @@ class MarkdownGenerationStrategy(ABC):

 class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
    """Default implementation of markdown generation strategy."""
-    def __init__(self, content_filter: Optional[RelevantContentFilter] = None):
-        super().__init__(content_filter)
+    def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
+        super().__init__(content_filter, options)
    
    def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]:
        link_map = {}
@@ -74,6 +75,7 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
                         cleaned_html: str, 
                         base_url: str = "",
                         html2text_options: Optional[Dict[str, Any]] = None,
+                         options: Optional[Dict[str, Any]] = None,
                         content_filter: Optional[RelevantContentFilter] = None,
                         citations: bool = True,
                         **kwargs) -> MarkdownGenerationResult:
@@ -82,6 +84,10 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
        h = CustomHTML2Text()
        if html2text_options:
            h.update_params(**html2text_options)
+        elif options:
+            h.update_params(**options)
+        elif self.options:
+            h.update_params(**self.options)

        # Generate raw markdown
        raw_markdown = h.handle(cleaned_html)
--- a/crawl4ai/user_agent_generator.py
+++ b/crawl4ai/user_agent_generator.py
@@ -236,6 +236,7 @@ class UserAgentGenerator:
 # Example usage:
 if __name__ == "__main__":
    generator = UserAgentGenerator()
+    print(generator.generate())
    
    print("\nSingle browser (Chrome):")
    print(generator.generate(num_browsers=1, browser_type='chrome'))
--- a/docs/examples/quickstart_async.py
+++ b/docs/examples/quickstart_async.py
@@ -547,19 +547,50 @@ async def generate_knowledge_graph():
            f.write(result.extracted_content)

 async def fit_markdown_remove_overlay():
-    async with AsyncWebCrawler(headless = False) as crawler:
-        url = "https://janineintheworld.com/places-to-visit-in-central-mexico"
+    async with AsyncWebCrawler(
+            headless=True,  # Set to False to see what is happening
+            verbose=True,
+            user_agent_mode="random",
+            user_agent_generator_config={
+                "device_type": "mobile",
+                "os_type": "android"
+            },
+    ) as crawler:
        result = await crawler.arun(
-            url=url,
+            url='https://www.kidocode.com/degrees/technology',
            cache_mode=CacheMode.BYPASS,
-            word_count_threshold = 10,
-            remove_overlay_elements=True,
-            screenshot = True
+            markdown_generator=DefaultMarkdownGenerator(
+                content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0),
+                options={
+                    "ignore_links": True
+                }
+            ),
+            # markdown_generator=DefaultMarkdownGenerator(
+            #     content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0),
+            #     options={
+            #         "ignore_links": True
+            #     }
+            # ),
        )
-        # Save markdown to file
-        with open(os.path.join(__location__, "mexico_places.md"), "w") as f:
-            f.write(result.fit_markdown)
-
+        
+        if result.success:
+            print(len(result.markdown_v2.raw_markdown))
+            print(len(result.markdown_v2.markdown_with_citations))
+            print(len(result.markdown_v2.fit_markdown))
+            
+            # Save clean html
+            with open(os.path.join(__location__, "output/cleaned_html.html"), "w") as f:
+                f.write(result.cleaned_html)
+            
+            with open(os.path.join(__location__, "output/output_raw_markdown.md"), "w") as f:
+                f.write(result.markdown_v2.raw_markdown)
+                
+            with open(os.path.join(__location__, "output/output_markdown_with_citations.md"), "w") as f:
+                f.write(result.markdown_v2.markdown_with_citations) 
+                
+            with open(os.path.join(__location__, "output/output_fit_markdown.md"), "w") as f:   
+                f.write(result.markdown_v2.fit_markdown)
+        
    print("Done")