chore: Bump version to 0.3.71 and improve error handling
- Update version number to 0.3.71
- Add sleep_on_close option to AsyncPlaywrightCrawlerStrategy
- Enhance context creation with additional options
- Improve error message formatting and visibility
- Update quickstart documentation
This commit is contained in:
CHANGELOG.md (27 lines changed)
@@ -1,5 +1,32 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## [v0.3.71] - 2024-10-18
|
||||||
|
|
||||||
|
### Changes
|
||||||
|
1. **Version Update**:
|
||||||
|
- Updated version number from 0.3.7 to 0.3.71.
|
||||||
|
|
||||||
|
2. **Crawler Enhancements**:
|
||||||
|
- Added `sleep_on_close` option to AsyncPlaywrightCrawlerStrategy for delayed browser closure.
|
||||||
|
- Improved context creation with additional options:
|
||||||
|
- Enabled `accept_downloads` and `java_script_enabled`.
|
||||||
|
- Added a cookie to enable cookies by default.
|
||||||
|
|
||||||
|
3. **Error Handling Improvements**:
|
||||||
|
- Enhanced error messages in AsyncWebCrawler's `arun` method.
|
||||||
|
- Updated error reporting format for better visibility and consistency.
|
||||||
|
|
||||||
|
4. **Performance Optimization**:
|
||||||
|
- Commented out automatic page and context closure in `crawl` method to potentially improve performance in certain scenarios.
|
||||||
|
|
||||||
|
### Documentation
|
||||||
|
- Updated quickstart notebook:
|
||||||
|
- Changed installation command to use the released package instead of GitHub repository.
|
||||||
|
- Updated kernel display name.
|
||||||
|
|
||||||
|
### Developer Notes
|
||||||
|
- Minor code refactoring and cleanup.
|
||||||
|
|
||||||
## [v0.3.7] - 2024-10-17
|
## [v0.3.7] - 2024-10-17
|
||||||
|
|
||||||
### New Features
|
### New Features
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
from .async_webcrawler import AsyncWebCrawler
|
from .async_webcrawler import AsyncWebCrawler
|
||||||
from .models import CrawlResult
|
from .models import CrawlResult
|
||||||
|
|
||||||
__version__ = "0.3.8"
|
__version__ = "0.3.71"
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"AsyncWebCrawler",
|
"AsyncWebCrawler",
|
||||||
|
|||||||
@@ -80,6 +80,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
self.verbose = kwargs.get("verbose", False)
|
self.verbose = kwargs.get("verbose", False)
|
||||||
self.playwright = None
|
self.playwright = None
|
||||||
self.browser = None
|
self.browser = None
|
||||||
|
self.sleep_on_close = kwargs.get("sleep_on_close", False)
|
||||||
self.hooks = {
|
self.hooks = {
|
||||||
'on_browser_created': None,
|
'on_browser_created': None,
|
||||||
'on_user_agent_updated': None,
|
'on_user_agent_updated': None,
|
||||||
@@ -132,6 +133,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
await self.execute_hook('on_browser_created', self.browser)
|
await self.execute_hook('on_browser_created', self.browser)
|
||||||
|
|
||||||
async def close(self):
|
async def close(self):
|
||||||
|
if self.sleep_on_close:
|
||||||
|
await asyncio.sleep(500)
|
||||||
if self.browser:
|
if self.browser:
|
||||||
await self.browser.close()
|
await self.browser.close()
|
||||||
self.browser = None
|
self.browser = None
|
||||||
@@ -296,8 +299,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
context = await self.browser.new_context(
|
context = await self.browser.new_context(
|
||||||
user_agent=self.user_agent,
|
user_agent=self.user_agent,
|
||||||
viewport={"width": 1920, "height": 1080},
|
viewport={"width": 1920, "height": 1080},
|
||||||
proxy={"server": self.proxy} if self.proxy else None
|
proxy={"server": self.proxy} if self.proxy else None,
|
||||||
|
accept_downloads=True,
|
||||||
|
java_script_enabled=True
|
||||||
)
|
)
|
||||||
|
await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}])
|
||||||
await context.set_extra_http_headers(self.headers)
|
await context.set_extra_http_headers(self.headers)
|
||||||
page = await context.new_page()
|
page = await context.new_page()
|
||||||
self.sessions[session_id] = (context, page, time.time())
|
self.sessions[session_id] = (context, page, time.time())
|
||||||
@@ -419,8 +425,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise RuntimeError(f"Wait condition failed: {str(e)}")
|
raise RuntimeError(f"Wait condition failed: {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Update image dimensions
|
# Update image dimensions
|
||||||
update_image_dimensions_js = """
|
update_image_dimensions_js = """
|
||||||
() => {
|
() => {
|
||||||
@@ -531,11 +535,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
)
|
)
|
||||||
return response
|
return response
|
||||||
except Error as e:
|
except Error as e:
|
||||||
raise Error(f"Failed to crawl {url}: {str(e)}")
|
raise Error(f"[ERROR] 🚫 crawl(): Failed to crawl {url}: {str(e)}")
|
||||||
finally:
|
# finally:
|
||||||
if not session_id:
|
# if not session_id:
|
||||||
await page.close()
|
# await page.close()
|
||||||
await context.close()
|
# await context.close()
|
||||||
|
|
||||||
async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
|
async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
|
||||||
semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed
|
semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed
|
||||||
|
|||||||
@@ -133,8 +133,8 @@ class AsyncWebCrawler:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
if not hasattr(e, "msg"):
|
if not hasattr(e, "msg"):
|
||||||
e.msg = str(e)
|
e.msg = str(e)
|
||||||
print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}")
|
print(f"[ERROR] 🚫 arun(): Failed to crawl {url}, error: {e.msg}")
|
||||||
return CrawlResult(url=url, html="", success=False, error_message=e.msg)
|
return CrawlResult(url=url, html="", markdown = f"[ERROR] 🚫 arun(): Failed to crawl {url}, error: {e.msg}", success=False, error_message=e.msg)
|
||||||
|
|
||||||
async def arun_many(
|
async def arun_many(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -47,8 +47,7 @@
|
|||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# !pip install \"crawl4ai @ git+https://github.com/unclecode/crawl4ai.git\"\n",
|
"!pip install crawl4ai\n",
|
||||||
"!pip install \"crawl4ai @ git+https://github.com/unclecode/crawl4ai.git@staging\"\n",
|
|
||||||
"!pip install nest-asyncio\n",
|
"!pip install nest-asyncio\n",
|
||||||
"!playwright install"
|
"!playwright install"
|
||||||
]
|
]
|
||||||
@@ -714,7 +713,7 @@
|
|||||||
"provenance": []
|
"provenance": []
|
||||||
},
|
},
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3",
|
"display_name": "venv",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
|
|||||||
Reference in New Issue
Block a user