From 24723b2f100ed25747b1b84a833f82e17340b457 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 12:45:05 +0800 Subject: [PATCH] Enhance features and documentation - Updated version to 0.3.743 - Improved ManagedBrowser configuration with dynamic host/port - Implemented fast HTML formatting in web crawler - Enhanced markdown generation with a new generator class - Improved sanitization and utility functions - Added contributor details and pull request acknowledgments - Updated documentation for clearer usage scenarios - Adjusted tests to reflect class name changes --- CONTRIBUTORS.md | 8 +++ crawl4ai/__version__.py | 2 +- crawl4ai/async_crawler_strategy.py | 24 +++++---- crawl4ai/async_webcrawler.py | 12 +++-- crawl4ai/content_scraping_strategy.py | 19 ++++--- crawl4ai/markdown_generation_strategy.py | 14 ++++-- crawl4ai/utils.py | 64 +++++++++++++++++++++--- docs/md_v2/advanced/hooks-auth.md | 8 ++- tests/async/test_markdown_genertor.py | 14 +++--- 9 files changed, 123 insertions(+), 42 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 0b5dcede..deb46a9c 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -10,11 +10,19 @@ We would like to thank the following people for their contributions to Crawl4AI: ## Community Contributors +- [aadityakanjolia4](https://github.com/aadityakanjolia4) - Fix for `CustomHTML2Text` is not defined. 
- [FractalMind](https://github.com/FractalMind) - Created the first official Docker Hub image and fixed Dockerfile errors - [ketonkss4](https://github.com/ketonkss4) - Identified Selenium's new capabilities, helping reduce dependencies - [jonymusky](https://github.com/jonymusky) - Javascript execution documentation, and wait_for - [datehoer](https://github.com/datehoer) - Add browser prxy support +## Pull Requests + +- [nelzomal](https://github.com/nelzomal) - Enhance development installation instructions [#286](https://github.com/unclecode/crawl4ai/pull/286) +- [HamzaFarhan](https://github.com/HamzaFarhan) - Handled the cases where markdown_with_citations, references_markdown, and filtered_html might not be defined [#293](https://github.com/unclecode/crawl4ai/pull/293) +- [NanmiCoder](https://github.com/NanmiCoder) - fix: crawler strategy exception handling and fixes [#271](https://github.com/unclecode/crawl4ai/pull/271) + + ## Other Contributors - [Gokhan](https://github.com/gkhngyk) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 05bfd336..37e3c08a 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.741" \ No newline at end of file +__version__ = "0.3.743" \ No newline at end of file diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 3f332eb0..882f9a50 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -35,13 +35,14 @@ stealth_config = StealthConfig( class ManagedBrowser: - def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None): + def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None, host: str = "localhost", debugging_port: int = 9222): self.browser_type = browser_type self.user_data_dir = user_data_dir self.headless = headless 
self.browser_process = None self.temp_dir = None - self.debugging_port = 9222 + self.debugging_port = debugging_port + self.host = host self.logger = logger self.shutting_down = False @@ -70,7 +71,7 @@ class ManagedBrowser: # Monitor browser process output for errors asyncio.create_task(self._monitor_browser_process()) await asyncio.sleep(2) # Give browser time to start - return f"http://localhost:{self.debugging_port}" + return f"http://{self.host}:{self.debugging_port}" except Exception as e: await self.cleanup() raise Exception(f"Failed to start browser: {e}") @@ -416,13 +417,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): else: raise ValueError(f"Invalid hook type: {hook_type}") - async def execute_hook(self, hook_type: str, *args): + async def execute_hook(self, hook_type: str, *args, **kwargs): hook = self.hooks.get(hook_type) if hook: if asyncio.iscoroutinefunction(hook): - return await hook(*args) + return await hook(*args, **kwargs) else: - return hook(*args) + return hook(*args, **kwargs) return args[0] if args else None def update_user_agent(self, user_agent: str): @@ -642,6 +643,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): session_id = kwargs.get("session_id") # Handle page creation differently for managed browser + context = None if self.use_managed_browser: if session_id: # Reuse existing session if available @@ -760,7 +762,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return response if not kwargs.get("js_only", False): - await self.execute_hook('before_goto', page) + await self.execute_hook('before_goto', page, context = context) response = await page.goto( @@ -773,7 +775,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # response = await page.goto("about:blank") # await page.evaluate(f"window.location.href = '{url}'") - await self.execute_hook('after_goto', page) + await self.execute_hook('after_goto', page, context = context) # Get status code and headers status_code = 
response.status @@ -838,7 +840,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # await page.wait_for_timeout(100) # Check for on execution event - await self.execute_hook('on_execution_started', page) + await self.execute_hook('on_execution_started', page, context = context) if kwargs.get("simulate_user", False) or kwargs.get("magic", False): # Simulate user interactions @@ -924,7 +926,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if kwargs.get("process_iframes", False): page = await self.process_iframes(page) - await self.execute_hook('before_retrieve_html', page) + await self.execute_hook('before_retrieve_html', page, context = context) # Check if delay_before_return_html is set then wait for that time delay_before_return_html = kwargs.get("delay_before_return_html") if delay_before_return_html: @@ -935,7 +937,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.remove_overlay_elements(page) html = await page.content() - await self.execute_hook('before_return_html', page, html) + await self.execute_hook('before_return_html', page, html, context = context) # Check if kwargs has screenshot=True then take screenshot screenshot_data = None diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index b8be6f35..5a46fe39 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -25,7 +25,8 @@ from .config import ( from .utils import ( sanitize_input_encode, InvalidCSSSelectorError, - format_html + format_html, + fast_format_html ) from urllib.parse import urlparse import random @@ -534,16 +535,17 @@ class AsyncWebCrawler: "timing": time.perf_counter() - t1 } ) - - - screenshot = None if not screenshot else screenshot + + if kwargs.get("prettiify", False): + cleaned_html = fast_format_html(cleaned_html) + return CrawlResult( url=url, html=html, - cleaned_html=format_html(cleaned_html), + cleaned_html=cleaned_html, markdown_v2=markdown_v2, markdown=markdown, 
fit_markdown=fit_markdown, diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index ea6a2ef8..ec6c3361 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -10,7 +10,7 @@ from urllib.parse import urljoin from requests.exceptions import InvalidSchema # from .content_cleaning_strategy import ContentCleaningStrategy from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter#, HeuristicContentFilter -from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerationStrategy +from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator from .models import MarkdownGenerationResult from .utils import ( sanitize_input_encode, @@ -105,21 +105,28 @@ class WebScrapingStrategy(ContentScrapingStrategy): Returns: Dict containing markdown content in various formats """ - markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerationStrategy()) + markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerator()) if markdown_generator: try: + if kwargs.get('fit_markdown', False) and not markdown_generator.content_filter: + markdown_generator.content_filter = BM25ContentFilter( + user_query=kwargs.get('fit_markdown_user_query', None), + bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) + ) + markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown( cleaned_html=cleaned_html, base_url=url, - html2text_options=kwargs.get('html2text', {}), - content_filter=kwargs.get('content_filter', None) + html2text_options=kwargs.get('html2text', {}) ) + help_message = """""" + return { 'markdown': markdown_result.raw_markdown, - 'fit_markdown': markdown_result.fit_markdown or "Set flag 'fit_markdown' to True to get cleaned HTML content.", - 'fit_html': markdown_result.fit_html or "Set flag 
'fit_markdown' to True to get cleaned HTML content.", + 'fit_markdown': markdown_result.fit_markdown, + 'fit_html': markdown_result.fit_html, 'markdown_v2': markdown_result } except Exception as e: diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index 7922c413..b1e43f9d 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -11,6 +11,8 @@ LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)') class MarkdownGenerationStrategy(ABC): """Abstract base class for markdown generation strategies.""" + def __init__(self, content_filter: Optional[RelevantContentFilter] = None): + self.content_filter = content_filter @abstractmethod def generate_markdown(self, @@ -23,8 +25,10 @@ class MarkdownGenerationStrategy(ABC): """Generate markdown from cleaned HTML.""" pass -class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): +class DefaultMarkdownGenerator(MarkdownGenerationStrategy): """Default implementation of markdown generation strategy.""" + def __init__(self, content_filter: Optional[RelevantContentFilter] = None): + super().__init__(content_filter) def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]: link_map = {} @@ -84,14 +88,18 @@ class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): raw_markdown = raw_markdown.replace(' ```', '```') # Convert links to citations + markdown_with_citations: str = "" + references_markdown: str = "" if citations: markdown_with_citations, references_markdown = self.convert_links_to_citations( raw_markdown, base_url ) # Generate fit markdown if content filter is provided - fit_markdown: Optional[str] = None - if content_filter: + fit_markdown: Optional[str] = "" + filtered_html: Optional[str] = "" + if content_filter or self.content_filter: + content_filter = content_filter or self.content_filter filtered_html = content_filter.filter_content(cleaned_html) 
filtered_html = '\n'.join('
<div>{}</div>
'.format(s) for s in filtered_html) fit_markdown = h.handle(filtered_html) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index b07562df..aaf27e91 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -233,12 +233,17 @@ def sanitize_html(html): def sanitize_input_encode(text: str) -> str: """Sanitize input to handle potential encoding issues.""" try: - # Attempt to encode and decode as UTF-8 to handle potential encoding issues - return text.encode('utf-8', errors='ignore').decode('utf-8') - except UnicodeEncodeError as e: - print(f"Warning: Encoding issue detected. Some characters may be lost. Error: {e}") - # Fall back to ASCII if UTF-8 fails - return text.encode('ascii', errors='ignore').decode('ascii') + try: + if not text: + return '' + # Attempt to encode and decode as UTF-8 to handle potential encoding issues + return text.encode('utf-8', errors='ignore').decode('utf-8') + except UnicodeEncodeError as e: + print(f"Warning: Encoding issue detected. Some characters may be lost. Error: {e}") + # Fall back to ASCII if UTF-8 fails + return text.encode('ascii', errors='ignore').decode('ascii') + except Exception as e: + raise ValueError(f"Error sanitizing input: {str(e)}") from e def escape_json_string(s): """ @@ -1079,9 +1084,54 @@ def wrap_text(draw, text, font, max_width): return '\n'.join(lines) def format_html(html_string): - soup = BeautifulSoup(html_string, 'html.parser') + soup = BeautifulSoup(html_string, 'lxml.parser') return soup.prettify() +def fast_format_html(html_string): + """ + A fast HTML formatter that uses string operations instead of parsing. 
+ + Args: + html_string (str): The HTML string to format + + Returns: + str: The formatted HTML string + """ + # Initialize variables + indent = 0 + indent_str = " " # Two spaces for indentation + formatted = [] + in_content = False + + # Split by < and > to separate tags and content + parts = html_string.replace('>', '>\n').replace('<', '\n<').split('\n') + + for part in parts: + if not part.strip(): + continue + + # Handle closing tags + if part.startswith(''): + formatted.append(indent_str * indent + part) + + # Handle opening tags + elif part.startswith('<'): + formatted.append(indent_str * indent + part) + indent += 1 + + # Handle content between tags + else: + content = part.strip() + if content: + formatted.append(indent_str * indent + content) + + return '\n'.join(formatted) + def normalize_url(href, base_url): """Normalize URLs to ensure consistent format""" from urllib.parse import urljoin, urlparse diff --git a/docs/md_v2/advanced/hooks-auth.md b/docs/md_v2/advanced/hooks-auth.md index e4b7d7ce..8da3a1cc 100644 --- a/docs/md_v2/advanced/hooks-auth.md +++ b/docs/md_v2/advanced/hooks-auth.md @@ -18,7 +18,7 @@ Let's see how we can customize the AsyncWebCrawler using hooks! 
In this example, import asyncio from crawl4ai import AsyncWebCrawler from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy -from playwright.async_api import Page, Browser +from playwright.async_api import Page, Browser, BrowserContext async def on_browser_created(browser: Browser): print("[HOOK] on_browser_created") @@ -71,7 +71,11 @@ from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy async def main(): print("\nšŸ”— Using Crawler Hooks: Let's see how we can customize the AsyncWebCrawler using hooks!") - crawler_strategy = AsyncPlaywrightCrawlerStrategy(verbose=True) + initial_cookies = [ + {"name": "sessionId", "value": "abc123", "domain": ".example.com"}, + {"name": "userId", "value": "12345", "domain": ".example.com"} + ] + crawler_strategy = AsyncPlaywrightCrawlerStrategy(verbose=True, cookies=initial_cookies) crawler_strategy.set_hook('on_browser_created', on_browser_created) crawler_strategy.set_hook('before_goto', before_goto) crawler_strategy.set_hook('after_goto', after_goto) diff --git a/tests/async/test_markdown_genertor.py b/tests/async/test_markdown_genertor.py index 025a0318..2b1102ab 100644 --- a/tests/async/test_markdown_genertor.py +++ b/tests/async/test_markdown_genertor.py @@ -11,7 +11,7 @@ import asyncio import os import time from typing import Dict, Any -from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerationStrategy +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator # Get current directory __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) @@ -41,7 +41,7 @@ def test_basic_markdown_conversion(): with open(__location__ + "/data/wikipedia.html", "r") as f: cleaned_html = f.read() - generator = DefaultMarkdownGenerationStrategy() + generator = DefaultMarkdownGenerator() start_time = time.perf_counter() result = generator.generate_markdown( @@ -70,7 +70,7 @@ def test_relative_links(): Also an [image](/images/test.png) and 
another [page](/wiki/Banana). """ - generator = DefaultMarkdownGenerationStrategy() + generator = DefaultMarkdownGenerator() result = generator.generate_markdown( cleaned_html=markdown, base_url="https://en.wikipedia.org" @@ -86,7 +86,7 @@ def test_duplicate_links(): Here's a [link](/test) and another [link](/test) and a [different link](/other). """ - generator = DefaultMarkdownGenerationStrategy() + generator = DefaultMarkdownGenerator() result = generator.generate_markdown( cleaned_html=markdown, base_url="https://example.com" @@ -102,7 +102,7 @@ def test_link_descriptions(): Here's a [link with title](/test "Test Title") and a [link with description](/other) to test. """ - generator = DefaultMarkdownGenerationStrategy() + generator = DefaultMarkdownGenerator() result = generator.generate_markdown( cleaned_html=markdown, base_url="https://example.com" @@ -120,7 +120,7 @@ def test_performance_large_document(): iterations = 5 times = [] - generator = DefaultMarkdownGenerationStrategy() + generator = DefaultMarkdownGenerator() for i in range(iterations): start_time = time.perf_counter() @@ -144,7 +144,7 @@ def test_image_links(): And a regular [link](/page). """ - generator = DefaultMarkdownGenerationStrategy() + generator = DefaultMarkdownGenerator() result = generator.generate_markdown( cleaned_html=markdown, base_url="https://example.com"