diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 0b5dcede..deb46a9c 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -10,11 +10,19 @@ We would like to thank the following people for their contributions to Crawl4AI: ## Community Contributors +- [aadityakanjolia4](https://github.com/aadityakanjolia4) - Fix for `CustomHTML2Text` is not defined. - [FractalMind](https://github.com/FractalMind) - Created the first official Docker Hub image and fixed Dockerfile errors - [ketonkss4](https://github.com/ketonkss4) - Identified Selenium's new capabilities, helping reduce dependencies - [jonymusky](https://github.com/jonymusky) - Javascript execution documentation, and wait_for - [datehoer](https://github.com/datehoer) - Add browser prxy support +## Pull Requests + +- [nelzomal](https://github.com/nelzomal) - Enhance development installation instructions [#286](https://github.com/unclecode/crawl4ai/pull/286) +- [HamzaFarhan](https://github.com/HamzaFarhan) - Handled the cases where markdown_with_citations, references_markdown, and filtered_html might not be defined [#293](https://github.com/unclecode/crawl4ai/pull/293) +- [NanmiCoder](https://github.com/NanmiCoder) - fix: crawler strategy exception handling and fixes [#271](https://github.com/unclecode/crawl4ai/pull/271) + + ## Other Contributors - [Gokhan](https://github.com/gkhngyk) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 05bfd336..37e3c08a 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.741" \ No newline at end of file +__version__ = "0.3.743" \ No newline at end of file diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 3f332eb0..882f9a50 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -35,13 +35,14 @@ stealth_config = StealthConfig( class ManagedBrowser: - def __init__(self, browser_type: str = "chromium", user_data_dir: 
Optional[str] = None, headless: bool = False, logger = None): + def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None, host: str = "localhost", debugging_port: int = 9222): self.browser_type = browser_type self.user_data_dir = user_data_dir self.headless = headless self.browser_process = None self.temp_dir = None - self.debugging_port = 9222 + self.debugging_port = debugging_port + self.host = host self.logger = logger self.shutting_down = False @@ -70,7 +71,7 @@ class ManagedBrowser: # Monitor browser process output for errors asyncio.create_task(self._monitor_browser_process()) await asyncio.sleep(2) # Give browser time to start - return f"http://localhost:{self.debugging_port}" + return f"http://{self.host}:{self.debugging_port}" except Exception as e: await self.cleanup() raise Exception(f"Failed to start browser: {e}") @@ -416,13 +417,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): else: raise ValueError(f"Invalid hook type: {hook_type}") - async def execute_hook(self, hook_type: str, *args): + async def execute_hook(self, hook_type: str, *args, **kwargs): hook = self.hooks.get(hook_type) if hook: if asyncio.iscoroutinefunction(hook): - return await hook(*args) + return await hook(*args, **kwargs) else: - return hook(*args) + return hook(*args, **kwargs) return args[0] if args else None def update_user_agent(self, user_agent: str): @@ -642,6 +643,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): session_id = kwargs.get("session_id") # Handle page creation differently for managed browser + context = None if self.use_managed_browser: if session_id: # Reuse existing session if available @@ -760,7 +762,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return response if not kwargs.get("js_only", False): - await self.execute_hook('before_goto', page) + await self.execute_hook('before_goto', page, context = context) response = await page.goto( @@ 
-773,7 +775,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # response = await page.goto("about:blank") # await page.evaluate(f"window.location.href = '{url}'") - await self.execute_hook('after_goto', page) + await self.execute_hook('after_goto', page, context = context) # Get status code and headers status_code = response.status @@ -838,7 +840,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # await page.wait_for_timeout(100) # Check for on execution event - await self.execute_hook('on_execution_started', page) + await self.execute_hook('on_execution_started', page, context = context) if kwargs.get("simulate_user", False) or kwargs.get("magic", False): # Simulate user interactions @@ -924,7 +926,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if kwargs.get("process_iframes", False): page = await self.process_iframes(page) - await self.execute_hook('before_retrieve_html', page) + await self.execute_hook('before_retrieve_html', page, context = context) # Check if delay_before_return_html is set then wait for that time delay_before_return_html = kwargs.get("delay_before_return_html") if delay_before_return_html: @@ -935,7 +937,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.remove_overlay_elements(page) html = await page.content() - await self.execute_hook('before_return_html', page, html) + await self.execute_hook('before_return_html', page, html, context = context) # Check if kwargs has screenshot=True then take screenshot screenshot_data = None diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index b8be6f35..5a46fe39 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -25,7 +25,8 @@ from .config import ( from .utils import ( sanitize_input_encode, InvalidCSSSelectorError, - format_html + format_html, + fast_format_html ) from urllib.parse import urlparse import random @@ -534,16 +535,17 @@ class AsyncWebCrawler: "timing": 
time.perf_counter() - t1 } ) - - - screenshot = None if not screenshot else screenshot + + if kwargs.get("prettify", False): + cleaned_html = fast_format_html(cleaned_html) + return CrawlResult( url=url, html=html, - cleaned_html=format_html(cleaned_html), + cleaned_html=cleaned_html, markdown_v2=markdown_v2, markdown=markdown, fit_markdown=fit_markdown, diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index ea6a2ef8..ec6c3361 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -10,7 +10,7 @@ from urllib.parse import urljoin from requests.exceptions import InvalidSchema # from .content_cleaning_strategy import ContentCleaningStrategy from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter#, HeuristicContentFilter -from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerationStrategy +from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator from .models import MarkdownGenerationResult from .utils import ( sanitize_input_encode, @@ -105,21 +105,28 @@ class WebScrapingStrategy(ContentScrapingStrategy): Returns: Dict containing markdown content in various formats """ - markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerationStrategy()) + markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerator()) if markdown_generator: try: + if kwargs.get('fit_markdown', False) and not markdown_generator.content_filter: + markdown_generator.content_filter = BM25ContentFilter( + user_query=kwargs.get('fit_markdown_user_query', None), + bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) + ) + markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown( cleaned_html=cleaned_html, base_url=url, - html2text_options=kwargs.get('html2text', {}), -
content_filter=kwargs.get('content_filter', None) + html2text_options=kwargs.get('html2text', {}) ) + help_message = """""" + return { 'markdown': markdown_result.raw_markdown, - 'fit_markdown': markdown_result.fit_markdown or "Set flag 'fit_markdown' to True to get cleaned HTML content.", - 'fit_html': markdown_result.fit_html or "Set flag 'fit_markdown' to True to get cleaned HTML content.", + 'fit_markdown': markdown_result.fit_markdown, + 'fit_html': markdown_result.fit_html, 'markdown_v2': markdown_result } except Exception as e: diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index 7922c413..b1e43f9d 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -11,6 +11,8 @@ LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)') class MarkdownGenerationStrategy(ABC): """Abstract base class for markdown generation strategies.""" + def __init__(self, content_filter: Optional[RelevantContentFilter] = None): + self.content_filter = content_filter @abstractmethod def generate_markdown(self, @@ -23,8 +25,10 @@ class MarkdownGenerationStrategy(ABC): """Generate markdown from cleaned HTML.""" pass -class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): +class DefaultMarkdownGenerator(MarkdownGenerationStrategy): """Default implementation of markdown generation strategy.""" + def __init__(self, content_filter: Optional[RelevantContentFilter] = None): + super().__init__(content_filter) def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]: link_map = {} @@ -84,14 +88,18 @@ class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): raw_markdown = raw_markdown.replace(' ```', '```') # Convert links to citations + markdown_with_citations: str = "" + references_markdown: str = "" if citations: markdown_with_citations, references_markdown = self.convert_links_to_citations( raw_markdown, base_url ) # 
Generate fit markdown if content filter is provided - fit_markdown: Optional[str] = None - if content_filter: + fit_markdown: Optional[str] = "" + filtered_html: Optional[str] = "" + if content_filter or self.content_filter: + content_filter = content_filter or self.content_filter filtered_html = content_filter.filter_content(cleaned_html) filtered_html = '\n'.join('