Commit Message:

- Added examples for Amazon product data extraction methods
  - Updated configuration options and enhance documentation
  - Minor refactoring for improved performance and readability
  - Cleaned up version control settings.
This commit is contained in:
UncleCode
2024-12-29 20:05:18 +08:00
parent f2d9912697
commit fb33a24891
27 changed files with 4371 additions and 1408 deletions

View File

@@ -69,6 +69,24 @@ class AsyncWebCrawler:
New way (recommended):
browser_config = BrowserConfig(browser_type="chromium", headless=True)
crawler = AsyncWebCrawler(config=browser_config)
Attributes:
browser_config (BrowserConfig): Configuration object for browser settings.
crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages.
logger (AsyncLogger): Logger instance for recording events and errors.
always_bypass_cache (bool): Whether to always bypass cache.
crawl4ai_folder (str): Directory for storing cache.
base_directory (str): Base directory for storing cache.
ready (bool): Whether the crawler is ready for use.
Methods:
start(): Start the crawler explicitly without using context manager.
close(): Close the crawler explicitly without using context manager.
arun(): Run the crawler for a single source: URL (web, local file, or raw HTML).
awarmup(): Perform warmup sequence.
arun_many(): Run the crawler for multiple sources.
aprocess_html(): Process HTML content.
"""
_domain_last_hit = {}
@@ -321,7 +339,7 @@ class AsyncWebCrawler:
# Initialize processing variables
async_response: AsyncCrawlResponse = None
cached_result = None
cached_result: CrawlResult = None
screenshot_data = None
pdf_data = None
extracted_content = None
@@ -373,52 +391,89 @@ class AsyncWebCrawler:
tag="FETCH"
)
# Process the HTML content
crawl_result = await self.aprocess_html(
url=url,
html=html,
extracted_content=extracted_content,
config=config, # Pass the config object instead of individual parameters
screenshot=screenshot_data,
pdf_data=pdf_data,
verbose=config.verbose,
is_raw_html = True if url.startswith("raw:") else False,
**kwargs
)
# Process the HTML content
crawl_result = await self.aprocess_html(
url=url,
html=html,
extracted_content=extracted_content,
config=config, # Pass the config object instead of individual parameters
screenshot=screenshot_data,
pdf_data=pdf_data,
verbose=config.verbose,
is_raw_html = True if url.startswith("raw:") else False,
**kwargs
)
# crawl_result.status_code = async_response.status_code
# crawl_result.response_headers = async_response.response_headers
# crawl_result.downloaded_files = async_response.downloaded_files
# crawl_result.ssl_certificate = async_response.ssl_certificate # Add SSL certificate
# else:
# crawl_result.status_code = 200
# crawl_result.response_headers = cached_result.response_headers if cached_result else {}
# crawl_result.ssl_certificate = cached_result.ssl_certificate if cached_result else None # Add SSL certificate from cache
# # Check and set values from async_response to crawl_result
try:
for key in vars(async_response):
if hasattr(crawl_result, key):
value = getattr(async_response, key, None)
current_value = getattr(crawl_result, key, None)
if value is not None and not current_value:
try:
setattr(crawl_result, key, value)
except Exception as e:
self.logger.warning(
message=f"Failed to set attribute {key}: {str(e)}",
tag="WARNING"
)
except Exception as e:
self.logger.warning(
message=f"Error copying response attributes: {str(e)}",
tag="WARNING"
)
crawl_result.success = bool(html)
crawl_result.session_id = getattr(config, 'session_id', None)
self.logger.success(
message="{url:.50}... | Status: {status} | Total: {timing}",
tag="COMPLETE",
params={
"url": cache_context.display_url,
"status": crawl_result.success,
"timing": f"{time.perf_counter() - start_time:.2f}s"
},
colors={
"status": Fore.GREEN if crawl_result.success else Fore.RED,
"timing": Fore.YELLOW
}
)
# Update cache if appropriate
if cache_context.should_write() and not bool(cached_result):
await async_db_manager.acache_url(crawl_result)
return crawl_result
# Set response data
if async_response:
crawl_result.status_code = async_response.status_code
crawl_result.response_headers = async_response.response_headers
crawl_result.downloaded_files = async_response.downloaded_files
crawl_result.ssl_certificate = async_response.ssl_certificate # Add SSL certificate
else:
crawl_result.status_code = 200
crawl_result.response_headers = cached_result.response_headers if cached_result else {}
crawl_result.ssl_certificate = cached_result.ssl_certificate if cached_result else None # Add SSL certificate from cache
self.logger.success(
message="{url:.50}... | Status: {status} | Total: {timing}",
tag="COMPLETE",
params={
"url": cache_context.display_url,
"status": True,
"timing": f"{time.perf_counter() - start_time:.2f}s"
},
colors={
"status": Fore.GREEN,
"timing": Fore.YELLOW
}
)
crawl_result.success = bool(html)
crawl_result.session_id = getattr(config, 'session_id', None)
self.logger.success(
message="{url:.50}... | Status: {status} | Total: {timing}",
tag="COMPLETE",
params={
"url": cache_context.display_url,
"status": crawl_result.success,
"timing": f"{time.perf_counter() - start_time:.2f}s"
},
colors={
"status": Fore.GREEN if crawl_result.success else Fore.RED,
"timing": Fore.YELLOW
}
)
# Update cache if appropriate
if cache_context.should_write() and not bool(cached_result):
await async_db_manager.acache_url(crawl_result)
return crawl_result
cached_result.success = bool(html)
cached_result.session_id = getattr(config, 'session_id', None)
return cached_result
except Exception as e:
error_context = get_error_context(sys.exc_info())
@@ -465,6 +520,7 @@ class AsyncWebCrawler:
extracted_content: Previously extracted content (if any)
config: Configuration object controlling processing behavior
screenshot: Screenshot data (if any)
pdf_data: PDF data (if any)
verbose: Whether to enable verbose logging
**kwargs: Additional parameters for backwards compatibility