refactor: improve error handling in DataProcessor and optimize data parsing logic

This commit is contained in:
UncleCode
2024-12-03 19:44:38 +08:00
parent 95a4f74d2a
commit e9639ad189

View File

@@ -7,6 +7,7 @@ from pathlib import Path
from typing import Optional, List, Union from typing import Optional, List, Union
import json import json
import asyncio import asyncio
from contextlib import nullcontext
from .models import CrawlResult, MarkdownGenerationResult from .models import CrawlResult, MarkdownGenerationResult
from .async_database import async_db_manager from .async_database import async_db_manager
from .chunking_strategy import * from .chunking_strategy import *
@@ -67,6 +68,7 @@ class AsyncWebCrawler:
always_bypass_cache: bool = False, always_bypass_cache: bool = False,
always_by_pass_cache: Optional[bool] = None, # Deprecated parameter always_by_pass_cache: Optional[bool] = None, # Deprecated parameter
base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
thread_safe: bool = False,
**kwargs, **kwargs,
): ):
""" """
@@ -104,6 +106,8 @@ class AsyncWebCrawler:
else: else:
self.always_bypass_cache = always_bypass_cache self.always_bypass_cache = always_bypass_cache
self._lock = asyncio.Lock() if thread_safe else None
self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai") self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
os.makedirs(self.crawl4ai_folder, exist_ok=True) os.makedirs(self.crawl4ai_folder, exist_ok=True)
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
@@ -178,6 +182,7 @@ class AsyncWebCrawler:
Returns: Returns:
CrawlResult: The result of crawling and processing CrawlResult: The result of crawling and processing
""" """
async with self._lock or nullcontext():
try: try:
# Handle deprecated parameters # Handle deprecated parameters
if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]): if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):