From a96e05d4ae9599191ea910b76c1ce58c8468d260 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 1 Jan 2025 19:39:02 +0800 Subject: [PATCH 1/3] refactor(crawler): optimize response handling and default settings - Set wait_for_images default to false for better performance - Simplify response attribute copying in AsyncWebCrawler - Add error handling and default conversion options to markdown generation --- crawl4ai/async_configs.py | 6 +- crawl4ai/async_webcrawler.py | 54 ++++++----- crawl4ai/markdown_generation_strategy.py | 112 ++++++++++++++++------- 3 files changed, 106 insertions(+), 66 deletions(-) diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index bd33d0b5..a4de071f 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -246,7 +246,7 @@ class CrawlerRunConfig: wait_for (str or None): A CSS selector or JS condition to wait for before extracting content. Default: None. wait_for_images (bool): If True, wait for images to load before extracting content. - Default: True. + Default: False. delay_before_return_html (float): Delay in seconds before retrieving final HTML. Default: 0.1. mean_delay (float): Mean base delay between requests when calling arun_many. 
@@ -345,7 +345,7 @@ class CrawlerRunConfig: wait_until: str = "domcontentloaded", page_timeout: int = PAGE_TIMEOUT, wait_for: str = None, - wait_for_images: bool = True, + wait_for_images: bool = False, delay_before_return_html: float = 0.1, mean_delay: float = 0.1, max_range: float = 0.3, @@ -503,7 +503,7 @@ class CrawlerRunConfig: wait_until=kwargs.get("wait_until", "domcontentloaded"), page_timeout=kwargs.get("page_timeout", 60000), wait_for=kwargs.get("wait_for"), - wait_for_images=kwargs.get("wait_for_images", True), + wait_for_images=kwargs.get("wait_for_images", False), delay_before_return_html=kwargs.get("delay_before_return_html", 0.1), mean_delay=kwargs.get("mean_delay", 0.1), max_range=kwargs.get("max_range", 0.3), diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index f99586a3..6ed8ec8f 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -418,34 +418,30 @@ class AsyncWebCrawler: **kwargs ) - # crawl_result.status_code = async_response.status_code - # crawl_result.response_headers = async_response.response_headers - # crawl_result.downloaded_files = async_response.downloaded_files - # crawl_result.ssl_certificate = async_response.ssl_certificate # Add SSL certificate - # else: - # crawl_result.status_code = 200 - # crawl_result.response_headers = cached_result.response_headers if cached_result else {} - # crawl_result.ssl_certificate = cached_result.ssl_certificate if cached_result else None # Add SSL certificate from cache + crawl_result.status_code = async_response.status_code + crawl_result.response_headers = async_response.response_headers + crawl_result.downloaded_files = async_response.downloaded_files + crawl_result.ssl_certificate = async_response.ssl_certificate # Add SSL certificate # # Check and set values from async_response to crawl_result - try: - for key in vars(async_response): - if hasattr(crawl_result, key): - value = getattr(async_response, key, None) - current_value = 
getattr(crawl_result, key, None) - if value is not None and not current_value: - try: - setattr(crawl_result, key, value) - except Exception as e: - self.logger.warning( - message=f"Failed to set attribute {key}: {str(e)}", - tag="WARNING" - ) - except Exception as e: - self.logger.warning( - message=f"Error copying response attributes: {str(e)}", - tag="WARNING" - ) + # try: + # for key in vars(async_response): + # if hasattr(crawl_result, key): + # value = getattr(async_response, key, None) + # current_value = getattr(crawl_result, key, None) + # if value is not None and not current_value: + # try: + # setattr(crawl_result, key, value) + # except Exception as e: + # self.logger.warning( + # message=f"Failed to set attribute {key}: {str(e)}", + # tag="WARNING" + # ) + # except Exception as e: + # self.logger.warning( + # message=f"Error copying response attributes: {str(e)}", + # tag="WARNING" + # ) crawl_result.success = bool(html) crawl_result.session_id = getattr(config, 'session_id', None) @@ -585,8 +581,10 @@ class AsyncWebCrawler: # Markdown Generation markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator() - if not config.content_filter and not markdown_generator.content_filter: - markdown_generator.content_filter = PruningContentFilter() + + # Uncomment if by default we want to use PruningContentFilter + # if not config.content_filter and not markdown_generator.content_filter: + # markdown_generator.content_filter = PruningContentFilter() markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown( cleaned_html=cleaned_html, diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index 474dc9e8..89e5e34e 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -143,41 +143,83 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy): Returns: MarkdownGenerationResult: Result containing 
raw markdown, fit markdown, fit HTML, and references markdown. """ - # Initialize HTML2Text with options - h = CustomHTML2Text() - if html2text_options: - h.update_params(**html2text_options) - elif options: - h.update_params(**options) - elif self.options: - h.update_params(**self.options) + try: + # Initialize HTML2Text with default options for better conversion + h = CustomHTML2Text(baseurl=base_url) + default_options = { + 'body_width': 0, # Disable text wrapping + 'ignore_emphasis': False, + 'ignore_links': False, + 'ignore_images': False, + 'protect_links': True, + 'single_line_break': True, + 'mark_code': True, + 'escape_snob': False + } + + # Update with custom options if provided + if html2text_options: + default_options.update(html2text_options) + elif options: + default_options.update(options) + elif self.options: + default_options.update(self.options) + + h.update_params(**default_options) - # Generate raw markdown - raw_markdown = h.handle(cleaned_html) - raw_markdown = raw_markdown.replace(' ```', '```') + # Ensure we have valid input + if not cleaned_html: + cleaned_html = "" + elif not isinstance(cleaned_html, str): + cleaned_html = str(cleaned_html) - # Convert links to citations - markdown_with_citations: str = "" - references_markdown: str = "" - if citations: - markdown_with_citations, references_markdown = self.convert_links_to_citations( - raw_markdown, base_url + # Generate raw markdown + try: + raw_markdown = h.handle(cleaned_html) + except Exception as e: + raw_markdown = f"Error converting HTML to markdown: {str(e)}" + + raw_markdown = raw_markdown.replace(' ```', '```') + + # Convert links to citations + markdown_with_citations: str = raw_markdown + references_markdown: str = "" + if citations: + try: + markdown_with_citations, references_markdown = self.convert_links_to_citations( + raw_markdown, base_url + ) + except Exception as e: + markdown_with_citations = raw_markdown + references_markdown = f"Error generating citations: {str(e)}" 
+ + # Generate fit markdown if content filter is provided + fit_markdown: Optional[str] = "" + filtered_html: Optional[str] = "" + if content_filter or self.content_filter: + try: + content_filter = content_filter or self.content_filter + filtered_html = content_filter.filter_content(cleaned_html) + filtered_html = '\n'.join('
{}
'.format(s) for s in filtered_html) + fit_markdown = h.handle(filtered_html) + except Exception as e: + fit_markdown = f"Error generating fit markdown: {str(e)}" + filtered_html = "" + + return MarkdownGenerationResult( + raw_markdown=raw_markdown or "", + markdown_with_citations=markdown_with_citations or "", + references_markdown=references_markdown or "", + fit_markdown=fit_markdown or "", + fit_html=filtered_html or "", + ) + except Exception as e: + # If anything fails, return empty strings with error message + error_msg = f"Error in markdown generation: {str(e)}" + return MarkdownGenerationResult( + raw_markdown=error_msg, + markdown_with_citations=error_msg, + references_markdown="", + fit_markdown="", + fit_html="", ) - - # Generate fit markdown if content filter is provided - fit_markdown: Optional[str] = "" - filtered_html: Optional[str] = "" - if content_filter or self.content_filter: - content_filter = content_filter or self.content_filter - filtered_html = content_filter.filter_content(cleaned_html) - filtered_html = '\n'.join('
{}
'.format(s) for s in filtered_html) - fit_markdown = h.handle(filtered_html) - - return MarkdownGenerationResult( - raw_markdown=raw_markdown, - markdown_with_citations=markdown_with_citations, - references_markdown=references_markdown, - fit_markdown=fit_markdown, - fit_html=filtered_html, - ) - From aa4f92f4587c4cc5757ef00ebf71fd3515633c8b Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 1 Jan 2025 19:39:42 +0800 Subject: [PATCH 2/3] refactor(crawler): - Update hello_world example with proper content filtering --- docs/examples/hello_world.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 docs/examples/hello_world.py diff --git a/docs/examples/hello_world.py b/docs/examples/hello_world.py new file mode 100644 index 00000000..bcdb0d71 --- /dev/null +++ b/docs/examples/hello_world.py @@ -0,0 +1,25 @@ +import os, sys + +sys.path.append( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +) + +import asyncio +from crawl4ai import * + +async def main(): + async with AsyncWebCrawler() as crawler: + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0) + ) + ) + result = await crawler.arun( + url="https://www.nbcnews.com/business", + config=crawler_config + ) + print(result.markdown_v2.raw_markdown[:500]) + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file From da1bc0f7bf952d85bdf35f76e2b8a8a13eb11d00 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 1 Jan 2025 19:42:35 +0800 Subject: [PATCH 3/3] Update version file --- crawl4ai/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 7f7cf687..3f798c0c 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.4.245" 
+__version__ = "0.4.246"