feat: add optional thread_safe flag to AsyncWebCrawler that wraps crawling in an asyncio.Lock
This commit is contained in:
@@ -7,6 +7,7 @@ from pathlib import Path
|
||||
from typing import Optional, List, Union
|
||||
import json
|
||||
import asyncio
|
||||
from contextlib import nullcontext
|
||||
from .models import CrawlResult, MarkdownGenerationResult
|
||||
from .async_database import async_db_manager
|
||||
from .chunking_strategy import *
|
||||
@@ -67,6 +68,7 @@ class AsyncWebCrawler:
|
||||
always_bypass_cache: bool = False,
|
||||
always_by_pass_cache: Optional[bool] = None, # Deprecated parameter
|
||||
base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
|
||||
thread_safe: bool = False,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
@@ -104,6 +106,8 @@ class AsyncWebCrawler:
|
||||
else:
|
||||
self.always_bypass_cache = always_bypass_cache
|
||||
|
||||
self._lock = asyncio.Lock() if thread_safe else None
|
||||
|
||||
self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
|
||||
os.makedirs(self.crawl4ai_folder, exist_ok=True)
|
||||
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
|
||||
@@ -178,6 +182,7 @@ class AsyncWebCrawler:
|
||||
Returns:
|
||||
CrawlResult: The result of crawling and processing
|
||||
"""
|
||||
async with self._lock or nullcontext():
|
||||
try:
|
||||
# Handle deprecated parameters
|
||||
if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
|
||||
|
||||
Reference in New Issue
Block a user