diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index bd33d0b5..a4de071f 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -246,7 +246,7 @@ class CrawlerRunConfig: wait_for (str or None): A CSS selector or JS condition to wait for before extracting content. Default: None. wait_for_images (bool): If True, wait for images to load before extracting content. - Default: True. + Default: False. delay_before_return_html (float): Delay in seconds before retrieving final HTML. Default: 0.1. mean_delay (float): Mean base delay between requests when calling arun_many. @@ -345,7 +345,7 @@ class CrawlerRunConfig: wait_until: str = "domcontentloaded", page_timeout: int = PAGE_TIMEOUT, wait_for: str = None, - wait_for_images: bool = True, + wait_for_images: bool = False, delay_before_return_html: float = 0.1, mean_delay: float = 0.1, max_range: float = 0.3, @@ -503,7 +503,7 @@ class CrawlerRunConfig: wait_until=kwargs.get("wait_until", "domcontentloaded"), page_timeout=kwargs.get("page_timeout", 60000), wait_for=kwargs.get("wait_for"), - wait_for_images=kwargs.get("wait_for_images", True), + wait_for_images=kwargs.get("wait_for_images", False), delay_before_return_html=kwargs.get("delay_before_return_html", 0.1), mean_delay=kwargs.get("mean_delay", 0.1), max_range=kwargs.get("max_range", 0.3), diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index f99586a3..6ed8ec8f 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -418,34 +418,30 @@ class AsyncWebCrawler: **kwargs ) - # crawl_result.status_code = async_response.status_code - # crawl_result.response_headers = async_response.response_headers - # crawl_result.downloaded_files = async_response.downloaded_files - # crawl_result.ssl_certificate = async_response.ssl_certificate # Add SSL certificate - # else: - # crawl_result.status_code = 200 - # crawl_result.response_headers = cached_result.response_headers if cached_result else {} - # 
crawl_result.ssl_certificate = cached_result.ssl_certificate if cached_result else None # Add SSL certificate from cache + crawl_result.status_code = async_response.status_code + crawl_result.response_headers = async_response.response_headers + crawl_result.downloaded_files = async_response.downloaded_files + crawl_result.ssl_certificate = async_response.ssl_certificate # Add SSL certificate # # Check and set values from async_response to crawl_result - try: - for key in vars(async_response): - if hasattr(crawl_result, key): - value = getattr(async_response, key, None) - current_value = getattr(crawl_result, key, None) - if value is not None and not current_value: - try: - setattr(crawl_result, key, value) - except Exception as e: - self.logger.warning( - message=f"Failed to set attribute {key}: {str(e)}", - tag="WARNING" - ) - except Exception as e: - self.logger.warning( - message=f"Error copying response attributes: {str(e)}", - tag="WARNING" - ) + # try: + # for key in vars(async_response): + # if hasattr(crawl_result, key): + # value = getattr(async_response, key, None) + # current_value = getattr(crawl_result, key, None) + # if value is not None and not current_value: + # try: + # setattr(crawl_result, key, value) + # except Exception as e: + # self.logger.warning( + # message=f"Failed to set attribute {key}: {str(e)}", + # tag="WARNING" + # ) + # except Exception as e: + # self.logger.warning( + # message=f"Error copying response attributes: {str(e)}", + # tag="WARNING" + # ) crawl_result.success = bool(html) crawl_result.session_id = getattr(config, 'session_id', None) @@ -585,8 +581,10 @@ class AsyncWebCrawler: # Markdown Generation markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator() - if not config.content_filter and not markdown_generator.content_filter: - markdown_generator.content_filter = PruningContentFilter() + + # Uncomment if by default we want to use PruningContentFilter + # if not 
config.content_filter and not markdown_generator.content_filter: + # markdown_generator.content_filter = PruningContentFilter() markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown( cleaned_html=cleaned_html, diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index 474dc9e8..89e5e34e 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -143,41 +143,83 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy): Returns: MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown. """ - # Initialize HTML2Text with options - h = CustomHTML2Text() - if html2text_options: - h.update_params(**html2text_options) - elif options: - h.update_params(**options) - elif self.options: - h.update_params(**self.options) + try: + # Initialize HTML2Text with default options for better conversion + h = CustomHTML2Text(baseurl=base_url) + default_options = { + 'body_width': 0, # Disable text wrapping + 'ignore_emphasis': False, + 'ignore_links': False, + 'ignore_images': False, + 'protect_links': True, + 'single_line_break': True, + 'mark_code': True, + 'escape_snob': False + } + + # Update with custom options if provided + if html2text_options: + default_options.update(html2text_options) + elif options: + default_options.update(options) + elif self.options: + default_options.update(self.options) + + h.update_params(**default_options) - # Generate raw markdown - raw_markdown = h.handle(cleaned_html) - raw_markdown = raw_markdown.replace(' ```', '```') + # Ensure we have valid input + if not cleaned_html: + cleaned_html = "" + elif not isinstance(cleaned_html, str): + cleaned_html = str(cleaned_html) - # Convert links to citations - markdown_with_citations: str = "" - references_markdown: str = "" - if citations: - markdown_with_citations, references_markdown = self.convert_links_to_citations( - raw_markdown, 
base_url + # Generate raw markdown + try: + raw_markdown = h.handle(cleaned_html) + except Exception as e: + raw_markdown = f"Error converting HTML to markdown: {str(e)}" + + raw_markdown = raw_markdown.replace(' ```', '```') + + # Convert links to citations + markdown_with_citations: str = raw_markdown + references_markdown: str = "" + if citations: + try: + markdown_with_citations, references_markdown = self.convert_links_to_citations( + raw_markdown, base_url + ) + except Exception as e: + markdown_with_citations = raw_markdown + references_markdown = f"Error generating citations: {str(e)}" + + # Generate fit markdown if content filter is provided + fit_markdown: Optional[str] = "" + filtered_html: Optional[str] = "" + if content_filter or self.content_filter: + try: + content_filter = content_filter or self.content_filter + filtered_html = content_filter.filter_content(cleaned_html) + filtered_html = '\n'.join('
<div>{}</div>
'.format(s) for s in filtered_html) + fit_markdown = h.handle(filtered_html) + except Exception as e: + fit_markdown = f"Error generating fit markdown: {str(e)}" + filtered_html = "" + + return MarkdownGenerationResult( + raw_markdown=raw_markdown or "", + markdown_with_citations=markdown_with_citations or "", + references_markdown=references_markdown or "", + fit_markdown=fit_markdown or "", + fit_html=filtered_html or "", + ) + except Exception as e: + # If anything fails, return empty strings with error message + error_msg = f"Error in markdown generation: {str(e)}" + return MarkdownGenerationResult( + raw_markdown=error_msg, + markdown_with_citations=error_msg, + references_markdown="", + fit_markdown="", + fit_html="", ) - - # Generate fit markdown if content filter is provided - fit_markdown: Optional[str] = "" - filtered_html: Optional[str] = "" - if content_filter or self.content_filter: - content_filter = content_filter or self.content_filter - filtered_html = content_filter.filter_content(cleaned_html) - filtered_html = '\n'.join('
<div>{}</div>
'.format(s) for s in filtered_html) - fit_markdown = h.handle(filtered_html) - - return MarkdownGenerationResult( - raw_markdown=raw_markdown, - markdown_with_citations=markdown_with_citations, - references_markdown=references_markdown, - fit_markdown=fit_markdown, - fit_html=filtered_html, - ) -