Commit Message:
Enhance Async Crawler with storage state handling - Updated Async Crawler to support storage state management. - Added error handling for URL validation in Async Web Crawler. - Modified README logo and improved .gitignore entries. - Fixed issues in multiple files for better code robustness.
This commit is contained in:
5
.gitignore
vendored
5
.gitignore
vendored
@@ -214,4 +214,7 @@ git_issues.md
|
|||||||
todo_executor.md
|
todo_executor.md
|
||||||
protect-all-except-feature.sh
|
protect-all-except-feature.sh
|
||||||
manage-collab.sh
|
manage-collab.sh
|
||||||
publish.sh
|
publish.sh
|
||||||
|
|
||||||
|
combine.sh
|
||||||
|
combined_output.txt
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
# 🔥🕷️ Crawl4AI: Crawl Smarter, Faster, Freely. For AI.
|
# 🚀🤖 Crawl4AI: Crawl Smarter, Faster, Freely. For AI.
|
||||||
|
|
||||||
<a href="https://trendshift.io/repositories/11716" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11716" alt="unclecode%2Fcrawl4ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
<a href="https://trendshift.io/repositories/11716" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11716" alt="unclecode%2Fcrawl4ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
||||||
|
|
||||||
|
|||||||
@@ -238,8 +238,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
|
|
||||||
self.user_agent = kwargs.get(
|
self.user_agent = kwargs.get(
|
||||||
"user_agent",
|
"user_agent",
|
||||||
# "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47"
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47"
|
||||||
"Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36"
|
# "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36"
|
||||||
)
|
)
|
||||||
user_agenr_generator = UserAgentGenerator()
|
user_agenr_generator = UserAgentGenerator()
|
||||||
if kwargs.get("user_agent_mode") == "random":
|
if kwargs.get("user_agent_mode") == "random":
|
||||||
@@ -254,6 +254,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent)
|
self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent)
|
||||||
self.headers.setdefault("sec-ch-ua", self.browser_hint)
|
self.headers.setdefault("sec-ch-ua", self.browser_hint)
|
||||||
self.cookies = kwargs.get("cookies", [])
|
self.cookies = kwargs.get("cookies", [])
|
||||||
|
self.storage_state = kwargs.get("storage_state", None)
|
||||||
self.sessions = {}
|
self.sessions = {}
|
||||||
self.session_ttl = 1800
|
self.session_ttl = 1800
|
||||||
self.js_code = js_code
|
self.js_code = js_code
|
||||||
@@ -315,7 +316,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
# If no default context exists, create one
|
# If no default context exists, create one
|
||||||
self.default_context = await self.browser.new_context(
|
self.default_context = await self.browser.new_context(
|
||||||
# viewport={"width": 1920, "height": 1080}
|
# viewport={"width": 1920, "height": 1080}
|
||||||
viewport={"width": self.viewport_width, "height": self.viewport_height}
|
viewport={"width": self.viewport_width, "height": self.viewport_height},
|
||||||
|
storage_state=self.storage_state,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Set up the default context
|
# Set up the default context
|
||||||
@@ -323,6 +325,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
await self.default_context.set_extra_http_headers(self.headers)
|
await self.default_context.set_extra_http_headers(self.headers)
|
||||||
if self.cookies:
|
if self.cookies:
|
||||||
await self.default_context.add_cookies(self.cookies)
|
await self.default_context.add_cookies(self.cookies)
|
||||||
|
if self.storage_state:
|
||||||
|
# If storage_state is a dictionary or file path, Playwright will handle it.
|
||||||
|
await self.default_context.storage_state(path=None) # Just ensuring default_context is ready
|
||||||
if self.accept_downloads:
|
if self.accept_downloads:
|
||||||
await self.default_context.set_default_timeout(60000)
|
await self.default_context.set_default_timeout(60000)
|
||||||
await self.default_context.set_default_navigation_timeout(60000)
|
await self.default_context.set_default_navigation_timeout(60000)
|
||||||
@@ -426,6 +431,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
self.default_context = self.browser
|
self.default_context = self.browser
|
||||||
else:
|
else:
|
||||||
self.browser = await self.playwright.chromium.launch(**browser_args)
|
self.browser = await self.playwright.chromium.launch(**browser_args)
|
||||||
|
self.default_context = self.browser
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Fallback to chromium if Chrome channel fails
|
# Fallback to chromium if Chrome channel fails
|
||||||
@@ -643,6 +649,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
viewport={"width": self.viewport_width, "height": self.viewport_height},
|
viewport={"width": self.viewport_width, "height": self.viewport_height},
|
||||||
proxy={"server": self.proxy} if self.proxy else None,
|
proxy={"server": self.proxy} if self.proxy else None,
|
||||||
accept_downloads=self.accept_downloads,
|
accept_downloads=self.accept_downloads,
|
||||||
|
storage_state=self.storage_state,
|
||||||
ignore_https_errors=True
|
ignore_https_errors=True
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -771,6 +778,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
proxy={"server": self.proxy} if self.proxy else None,
|
proxy={"server": self.proxy} if self.proxy else None,
|
||||||
java_script_enabled=True,
|
java_script_enabled=True,
|
||||||
accept_downloads=self.accept_downloads,
|
accept_downloads=self.accept_downloads,
|
||||||
|
storage_state=self.storage_state,
|
||||||
# downloads_path=self.downloads_path if self.accept_downloads else None
|
# downloads_path=self.downloads_path if self.accept_downloads else None
|
||||||
)
|
)
|
||||||
await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}])
|
await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}])
|
||||||
@@ -792,6 +800,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
viewport={"width": self.viewport_width, "height": self.viewport_height},
|
viewport={"width": self.viewport_width, "height": self.viewport_height},
|
||||||
proxy={"server": self.proxy} if self.proxy else None,
|
proxy={"server": self.proxy} if self.proxy else None,
|
||||||
accept_downloads=self.accept_downloads,
|
accept_downloads=self.accept_downloads,
|
||||||
|
storage_state=self.storage_state,
|
||||||
ignore_https_errors=True # Add this line
|
ignore_https_errors=True # Add this line
|
||||||
)
|
)
|
||||||
if self.cookies:
|
if self.cookies:
|
||||||
@@ -862,7 +871,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
return response
|
return response
|
||||||
|
|
||||||
if not kwargs.get("js_only", False):
|
if not kwargs.get("js_only", False):
|
||||||
await self.execute_hook('before_goto', page, context = context)
|
await self.execute_hook('before_goto', page, context = context, **kwargs)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = await page.goto(
|
response = await page.goto(
|
||||||
@@ -874,7 +883,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
except Error as e:
|
except Error as e:
|
||||||
raise RuntimeError(f"Failed on navigating ACS-GOTO :\n{str(e)}")
|
raise RuntimeError(f"Failed on navigating ACS-GOTO :\n{str(e)}")
|
||||||
|
|
||||||
await self.execute_hook('after_goto', page, context = context)
|
await self.execute_hook('after_goto', page, context = context, **kwargs)
|
||||||
|
|
||||||
# Get status code and headers
|
# Get status code and headers
|
||||||
status_code = response.status
|
status_code = response.status
|
||||||
@@ -929,9 +938,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
# CONTENT LOADING ASSURANCE
|
# CONTENT LOADING ASSURANCE
|
||||||
if not self.text_only and (kwargs.get("wait_for_images", True) or kwargs.get("adjust_viewport_to_content", False)):
|
if not self.text_only and (kwargs.get("wait_for_images", True) or kwargs.get("adjust_viewport_to_content", False)):
|
||||||
# Wait for network idle after initial load and images to load
|
# Wait for network idle after initial load and images to load
|
||||||
await page.wait_for_load_state("networkidle")
|
# await page.wait_for_load_state("networkidle")
|
||||||
|
await page.wait_for_load_state("domcontentloaded")
|
||||||
await asyncio.sleep(0.1)
|
await asyncio.sleep(0.1)
|
||||||
await page.wait_for_function("Array.from(document.images).every(img => img.complete)")
|
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
|
||||||
|
try:
|
||||||
|
await page.wait_for_function("Array.from(document.images).every(img => img.complete)", timeout=1000)
|
||||||
|
# Check for TimeoutError and ignore it
|
||||||
|
except PlaywrightTimeoutError:
|
||||||
|
pass
|
||||||
|
|
||||||
# After initial load, adjust viewport to content size
|
# After initial load, adjust viewport to content size
|
||||||
if not self.text_only and kwargs.get("adjust_viewport_to_content", False):
|
if not self.text_only and kwargs.get("adjust_viewport_to_content", False):
|
||||||
@@ -1015,7 +1030,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
# await page.wait_for_timeout(100)
|
# await page.wait_for_timeout(100)
|
||||||
|
|
||||||
# Check for on execution event
|
# Check for on execution event
|
||||||
await self.execute_hook('on_execution_started', page, context = context)
|
await self.execute_hook('on_execution_started', page, context = context, **kwargs)
|
||||||
|
|
||||||
if kwargs.get("simulate_user", False) or kwargs.get("magic", False):
|
if kwargs.get("simulate_user", False) or kwargs.get("magic", False):
|
||||||
# Simulate user interactions
|
# Simulate user interactions
|
||||||
@@ -1119,7 +1134,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
if kwargs.get("process_iframes", False):
|
if kwargs.get("process_iframes", False):
|
||||||
page = await self.process_iframes(page)
|
page = await self.process_iframes(page)
|
||||||
|
|
||||||
await self.execute_hook('before_retrieve_html', page, context = context)
|
await self.execute_hook('before_retrieve_html', page, context = context, **kwargs)
|
||||||
# Check if delay_before_return_html is set then wait for that time
|
# Check if delay_before_return_html is set then wait for that time
|
||||||
delay_before_return_html = kwargs.get("delay_before_return_html", 0.1)
|
delay_before_return_html = kwargs.get("delay_before_return_html", 0.1)
|
||||||
if delay_before_return_html:
|
if delay_before_return_html:
|
||||||
@@ -1130,7 +1145,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
await self.remove_overlay_elements(page)
|
await self.remove_overlay_elements(page)
|
||||||
|
|
||||||
html = await page.content()
|
html = await page.content()
|
||||||
await self.execute_hook('before_return_html', page, html, context = context)
|
await self.execute_hook('before_return_html', page, html, context = context, **kwargs)
|
||||||
|
|
||||||
# Check if kwargs has screenshot=True then take screenshot
|
# Check if kwargs has screenshot=True then take screenshot
|
||||||
screenshot_data = None
|
screenshot_data = None
|
||||||
@@ -1394,6 +1409,25 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
return base64.b64encode(buffered.getvalue()).decode('utf-8')
|
return base64.b64encode(buffered.getvalue()).decode('utf-8')
|
||||||
finally:
|
finally:
|
||||||
await page.close()
|
await page.close()
|
||||||
|
|
||||||
|
async def export_storage_state(self, path: Optional[str] = None) -> Optional[dict]:
    """
    Export the storage state of the default browser context.

    Delegates to ``self.default_context.storage_state(path=path)``. Per the
    Playwright documentation, that snapshot contains cookies and local
    storage; sessionStorage is not part of it.

    Args:
        path: Optional file path forwarded to Playwright, which writes the
            state there. When None, nothing is written to disk and the
            state is only returned.

    Returns:
        The state object returned by Playwright, or None when there is no
        default context that can export one (a warning is logged instead).
    """
    context = self.default_context
    if not context:
        self.logger.warning(
            message="No default_context available to export storage state.",
            tag="WARNING"
        )
        return None

    # NOTE(review): other code paths in this class assign the browser object
    # itself to default_context; a Playwright Browser does not appear to expose
    # storage_state(), so guard the lookup instead of raising AttributeError.
    exporter = getattr(context, "storage_state", None)
    if exporter is None:
        self.logger.warning(
            message="default_context does not support exporting storage state.",
            tag="WARNING"
        )
        return None

    state = await exporter(path=path)

    if path:
        self.logger.info(
            message="Exported storage state to {path}",
            tag="INFO",
            params={"path": path}
        )
    else:
        # Without a path nothing was written, so do not log "to None".
        self.logger.info(
            message="Exported storage state (not written to disk)",
            tag="INFO"
        )
    return state
|
||||||
|
|
||||||
async def _generate_screenshot_from_html(self, html: str) -> Optional[str]:
|
async def _generate_screenshot_from_html(self, html: str) -> Optional[str]:
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -182,6 +182,10 @@ class AsyncWebCrawler:
|
|||||||
Returns:
|
Returns:
|
||||||
CrawlResult: The result of crawling and processing
|
CrawlResult: The result of crawling and processing
|
||||||
"""
|
"""
|
||||||
|
# Check if url is not string and is not empty
|
||||||
|
if not isinstance(url, str) or not url:
|
||||||
|
raise ValueError("Invalid URL, make sure the URL is a non-empty string")
|
||||||
|
|
||||||
async with self._lock or nullcontext():
|
async with self._lock or nullcontext():
|
||||||
try:
|
try:
|
||||||
# Handle deprecated parameters
|
# Handle deprecated parameters
|
||||||
@@ -335,7 +339,8 @@ class AsyncWebCrawler:
|
|||||||
# print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}")
|
# print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}")
|
||||||
|
|
||||||
self.logger.error_status(
|
self.logger.error_status(
|
||||||
url=cache_context.display_url,
|
# url=cache_context.display_url,
|
||||||
|
url=url,
|
||||||
error=create_box_message(e.msg, type = "error"),
|
error=create_box_message(e.msg, type = "error"),
|
||||||
tag="ERROR"
|
tag="ERROR"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -632,7 +632,7 @@ class ContentSummarizationStrategy(ExtractionStrategy):
|
|||||||
# Sort summaries by the original section index to maintain order
|
# Sort summaries by the original section index to maintain order
|
||||||
summaries.sort(key=lambda x: x[0])
|
summaries.sort(key=lambda x: x[0])
|
||||||
return [summary for _, summary in summaries]
|
return [summary for _, summary in summaries]
|
||||||
|
|
||||||
class JsonCssExtractionStrategy(ExtractionStrategy):
|
class JsonCssExtractionStrategy(ExtractionStrategy):
|
||||||
def __init__(self, schema: Dict[str, Any], **kwargs):
|
def __init__(self, schema: Dict[str, Any], **kwargs):
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|||||||
@@ -147,6 +147,7 @@ class CustomHTML2Text(HTML2Text):
|
|||||||
# self.preserved_content.append(data)
|
# self.preserved_content.append(data)
|
||||||
# return
|
# return
|
||||||
# super().handle_data(data, entity_char)
|
# super().handle_data(data, entity_char)
|
||||||
|
|
||||||
class InvalidCSSSelectorError(Exception):
|
class InvalidCSSSelectorError(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|||||||
2
main.py
2
main.py
@@ -342,7 +342,7 @@ app.add_middleware(
|
|||||||
|
|
||||||
# API token security
|
# API token security
|
||||||
security = HTTPBearer()
|
security = HTTPBearer()
|
||||||
CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code"
|
CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN")
|
||||||
|
|
||||||
async def verify_token(credentials: HTTPAuthorizationCredentials = Security(security)):
|
async def verify_token(credentials: HTTPAuthorizationCredentials = Security(security)):
|
||||||
if not CRAWL4AI_API_TOKEN:
|
if not CRAWL4AI_API_TOKEN:
|
||||||
|
|||||||
Reference in New Issue
Block a user