From 2d31915f0a0b8f1e5cecfaff0514423c20b6daeb Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 9 Dec 2024 20:04:59 +0800 Subject: [PATCH] Commit Message: Enhance Async Crawler with storage state handling - Updated Async Crawler to support storage state management. - Added error handling for URL validation in Async Web Crawler. - Modified README logo and improved .gitignore entries. - Fixed issues in multiple files for better code robustness. --- .gitignore | 5 ++- README.md | 2 +- crawl4ai/async_crawler_strategy.py | 54 ++++++++++++++++++++++++------ crawl4ai/async_webcrawler.py | 7 +++- crawl4ai/extraction_strategy.py | 2 +- crawl4ai/utils.py | 1 + main.py | 2 +- 7 files changed, 58 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 52e25a2a..02c75b3f 100644 --- a/.gitignore +++ b/.gitignore @@ -214,4 +214,7 @@ git_issues.md todo_executor.md protect-all-except-feature.sh manage-collab.sh -publish.sh \ No newline at end of file +publish.sh + +combine.sh +combined_output.txt \ No newline at end of file diff --git a/README.md b/README.md index dede4a03..095c595c 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# 🔥🕷️ Crawl4AI: Crawl Smarter, Faster, Freely. For AI. +# 🚀🤖 Crawl4AI: Crawl Smarter, Faster, Freely. For AI. 
unclecode%2Fcrawl4ai | Trendshift diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 5c706239..fca0c0ec 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -238,8 +238,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.user_agent = kwargs.get( "user_agent", - # "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" - "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36" + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" + # "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36" ) user_agenr_generator = UserAgentGenerator() if kwargs.get("user_agent_mode") == "random": @@ -254,6 +254,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent) self.headers.setdefault("sec-ch-ua", self.browser_hint) self.cookies = kwargs.get("cookies", []) + self.storage_state = kwargs.get("storage_state", None) self.sessions = {} self.session_ttl = 1800 self.js_code = js_code @@ -315,7 +316,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # If no default context exists, create one self.default_context = await self.browser.new_context( # viewport={"width": 1920, "height": 1080} - viewport={"width": self.viewport_width, "height": self.viewport_height} + viewport={"width": self.viewport_width, "height": self.viewport_height}, + storage_state=self.storage_state, ) # Set up the default context @@ -323,6 +325,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.default_context.set_extra_http_headers(self.headers) if self.cookies: await 
self.default_context.add_cookies(self.cookies) + if self.storage_state: + # NOTE(review): storage_state() only returns a snapshot of the context's cookies/localStorage; it does not load self.storage_state into the context - confirm this call is needed. + await self.default_context.storage_state(path=None) # Result is discarded; this reads state, it does not apply it if self.accept_downloads: await self.default_context.set_default_timeout(60000) await self.default_context.set_default_navigation_timeout(60000) @@ -426,6 +431,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.default_context = self.browser else: self.browser = await self.playwright.chromium.launch(**browser_args) + self.default_context = self.browser except Exception as e: # Fallback to chromium if Chrome channel fails @@ -643,6 +649,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): viewport={"width": self.viewport_width, "height": self.viewport_height}, proxy={"server": self.proxy} if self.proxy else None, accept_downloads=self.accept_downloads, + storage_state=self.storage_state, ignore_https_errors=True ) @@ -771,6 +778,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): proxy={"server": self.proxy} if self.proxy else None, java_script_enabled=True, accept_downloads=self.accept_downloads, + storage_state=self.storage_state, # downloads_path=self.downloads_path if self.accept_downloads else None ) await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) @@ -792,6 +800,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): viewport={"width": self.viewport_width, "height": self.viewport_height}, proxy={"server": self.proxy} if self.proxy else None, accept_downloads=self.accept_downloads, + storage_state=self.storage_state, ignore_https_errors=True # Add this line ) if self.cookies: @@ -862,7 +871,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return response if not kwargs.get("js_only", False): - await self.execute_hook('before_goto', page, context = context) + await self.execute_hook('before_goto', page, context = 
context, **kwargs) try: response = await page.goto( @@ -874,7 +883,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Error as e: raise RuntimeError(f"Failed on navigating ACS-GOTO :\n{str(e)}") - await self.execute_hook('after_goto', page, context = context) + await self.execute_hook('after_goto', page, context = context, **kwargs) # Get status code and headers status_code = response.status @@ -929,9 +938,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # CONTENT LOADING ASSURANCE if not self.text_only and (kwargs.get("wait_for_images", True) or kwargs.get("adjust_viewport_to_content", False)): # Wait for network idle after initial load and images to load - await page.wait_for_load_state("networkidle") + # await page.wait_for_load_state("networkidle") + await page.wait_for_load_state("domcontentloaded") await asyncio.sleep(0.1) - await page.wait_for_function("Array.from(document.images).every(img => img.complete)") + from playwright.async_api import TimeoutError as PlaywrightTimeoutError + try: + await page.wait_for_function("Array.from(document.images).every(img => img.complete)", timeout=1000) + # Check for TimeoutError and ignore it + except PlaywrightTimeoutError: + pass # After initial load, adjust viewport to content size if not self.text_only and kwargs.get("adjust_viewport_to_content", False): @@ -1015,7 +1030,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # await page.wait_for_timeout(100) # Check for on execution event - await self.execute_hook('on_execution_started', page, context = context) + await self.execute_hook('on_execution_started', page, context = context, **kwargs) if kwargs.get("simulate_user", False) or kwargs.get("magic", False): # Simulate user interactions @@ -1119,7 +1134,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if kwargs.get("process_iframes", False): page = await self.process_iframes(page) - await self.execute_hook('before_retrieve_html', page, context = 
context) + await self.execute_hook('before_retrieve_html', page, context = context, **kwargs) # Check if delay_before_return_html is set then wait for that time delay_before_return_html = kwargs.get("delay_before_return_html", 0.1) if delay_before_return_html: @@ -1130,7 +1145,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.remove_overlay_elements(page) html = await page.content() - await self.execute_hook('before_return_html', page, html, context = context) + await self.execute_hook('before_return_html', page, html, context = context, **kwargs) # Check if kwargs has screenshot=True then take screenshot screenshot_data = None @@ -1394,6 +1409,25 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return base64.b64encode(buffered.getvalue()).decode('utf-8') finally: await page.close() + + async def export_storage_state(self, path: str = None) -> dict: + """ + Exports the current storage state of the default context (cookies and localStorage; sessionStorage is not included by Playwright) + and, when a path is given, also writes it to a JSON file at that path. 
+ """ + if self.default_context: + state = await self.default_context.storage_state(path=path) + self.logger.info( + message="Exported storage state to {path}", + tag="INFO", + params={"path": path} + ) + return state + else: + self.logger.warning( + message="No default_context available to export storage state.", + tag="WARNING" + ) async def _generate_screenshot_from_html(self, html: str) -> Optional[str]: """ diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 2c17602d..b872c20c 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -182,6 +182,10 @@ class AsyncWebCrawler: Returns: CrawlResult: The result of crawling and processing """ + # Check if url is not string and is not empty + if not isinstance(url, str) or not url: + raise ValueError("Invalid URL, make sure the URL is a non-empty string") + async with self._lock or nullcontext(): try: # Handle deprecated parameters @@ -335,7 +339,8 @@ class AsyncWebCrawler: # print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... 
| {e.msg}{Style.RESET_ALL}") self.logger.error_status( - url=cache_context.display_url, + # url=cache_context.display_url, + url=url, error=create_box_message(e.msg, type = "error"), tag="ERROR" ) diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index b79e0c43..a778bf4d 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -632,7 +632,7 @@ class ContentSummarizationStrategy(ExtractionStrategy): # Sort summaries by the original section index to maintain order summaries.sort(key=lambda x: x[0]) return [summary for _, summary in summaries] - + class JsonCssExtractionStrategy(ExtractionStrategy): def __init__(self, schema: Dict[str, Any], **kwargs): super().__init__(**kwargs) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 0a9e6f56..879ba562 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -147,6 +147,7 @@ class CustomHTML2Text(HTML2Text): # self.preserved_content.append(data) # return # super().handle_data(data, entity_char) + class InvalidCSSSelectorError(Exception): pass diff --git a/main.py b/main.py index d6c792e8..21d3de16 100644 --- a/main.py +++ b/main.py @@ -342,7 +342,7 @@ app.add_middleware( # API token security security = HTTPBearer() -CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code" +CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN") async def verify_token(credentials: HTTPAuthorizationCredentials = Security(security)): if not CRAWL4AI_API_TOKEN: