diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 189a2955..6f8b06f4 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.747" +__version__ = "0.4.0" diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 3d24bd84..493597ea 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -6,6 +6,7 @@ from typing import Callable, Dict, Any, List, Optional, Awaitable import os, sys, shutil import tempfile, subprocess from playwright.async_api import async_playwright, Page, Browser, Error +from playwright.async_api import TimeoutError as PlaywrightTimeoutError from io import BytesIO from PIL import Image, ImageDraw, ImageFont from pathlib import Path @@ -223,6 +224,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.use_cached_html = use_cached_html self.user_agent = kwargs.get( "user_agent", + # "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36" ) user_agenr_generator = UserAgentGenerator() @@ -941,11 +943,24 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): }); } """ + try: - await page.wait_for_load_state() + try: + await page.wait_for_load_state( + # state="load", + state="domcontentloaded", + timeout=5000 + ) + except PlaywrightTimeoutError: + pass await page.evaluate(update_image_dimensions_js) except Exception as e: - raise RuntimeError(f"Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {str(e)}") + self.logger.error( + message="Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {error}", + tag="ERROR", + params={"error": str(e)} + ) + # raise RuntimeError(f"Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {str(e)}") # 
Wait a bit for any onload events to complete await page.wait_for_timeout(100) diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index f242054d..1e0ca664 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -11,8 +11,9 @@ LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)') class MarkdownGenerationStrategy(ABC): """Abstract base class for markdown generation strategies.""" - def __init__(self, content_filter: Optional[RelevantContentFilter] = None): + def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None): self.content_filter = content_filter + self.options = options or {} @abstractmethod def generate_markdown(self, @@ -27,8 +28,8 @@ class MarkdownGenerationStrategy(ABC): class DefaultMarkdownGenerator(MarkdownGenerationStrategy): """Default implementation of markdown generation strategy.""" - def __init__(self, content_filter: Optional[RelevantContentFilter] = None): - super().__init__(content_filter) + def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None): + super().__init__(content_filter, options) def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]: link_map = {} @@ -74,6 +75,7 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy): cleaned_html: str, base_url: str = "", html2text_options: Optional[Dict[str, Any]] = None, + options: Optional[Dict[str, Any]] = None, content_filter: Optional[RelevantContentFilter] = None, citations: bool = True, **kwargs) -> MarkdownGenerationResult: @@ -82,6 +84,10 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy): h = CustomHTML2Text() if html2text_options: h.update_params(**html2text_options) + elif options: + h.update_params(**options) + elif self.options: + h.update_params(**self.options) # Generate raw markdown raw_markdown = 
h.handle(cleaned_html) diff --git a/crawl4ai/user_agent_generator.py b/crawl4ai/user_agent_generator.py index 0a4df0bb..a1f3a49e 100644 --- a/crawl4ai/user_agent_generator.py +++ b/crawl4ai/user_agent_generator.py @@ -236,6 +236,7 @@ class UserAgentGenerator: # Example usage: if __name__ == "__main__": generator = UserAgentGenerator() + print(generator.generate()) print("\nSingle browser (Chrome):") print(generator.generate(num_browsers=1, browser_type='chrome')) diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 73d695c3..176b0ba7 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -547,19 +547,50 @@ async def generate_knowledge_graph(): f.write(result.extracted_content) async def fit_markdown_remove_overlay(): - async with AsyncWebCrawler(headless = False) as crawler: - url = "https://janineintheworld.com/places-to-visit-in-central-mexico" + async with AsyncWebCrawler( + headless=True, # Set to False to see what is happening + verbose=True, + user_agent_mode="random", + user_agent_generator_config={ + "device_type": "mobile", + "os_type": "android" + }, + ) as crawler: result = await crawler.arun( - url=url, + url='https://www.kidocode.com/degrees/technology', cache_mode=CacheMode.BYPASS, - word_count_threshold = 10, - remove_overlay_elements=True, - screenshot = True + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0), + options={ + "ignore_links": True + } + ), + # markdown_generator=DefaultMarkdownGenerator( + # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0), + # options={ + # "ignore_links": True + # } + # ), ) - # Save markdown to file - with open(os.path.join(__location__, "mexico_places.md"), "w") as f: - f.write(result.fit_markdown) - + + if result.success: + print(len(result.markdown_v2.raw_markdown)) + print(len(result.markdown_v2.markdown_with_citations)) + 
print(len(result.markdown_v2.fit_markdown)) + + # Save clean html + with open(os.path.join(__location__, "output/cleaned_html.html"), "w") as f: + f.write(result.cleaned_html) + + with open(os.path.join(__location__, "output/output_raw_markdown.md"), "w") as f: + f.write(result.markdown_v2.raw_markdown) + + with open(os.path.join(__location__, "output/output_markdown_with_citations.md"), "w") as f: + f.write(result.markdown_v2.markdown_with_citations) + + with open(os.path.join(__location__, "output/output_fit_markdown.md"), "w") as f: + f.write(result.markdown_v2.fit_markdown) + print("Done")