refactor: improve error handling in DataProcessor and optimize data parsing logic

This commit is contained in:
UncleCode
2024-12-03 19:44:38 +08:00
parent 95a4f74d2a
commit e9639ad189

View File

@@ -7,6 +7,7 @@ from pathlib import Path
from typing import Optional, List, Union
import json
import asyncio
from contextlib import nullcontext
from .models import CrawlResult, MarkdownGenerationResult
from .async_database import async_db_manager
from .chunking_strategy import *
@@ -67,6 +68,7 @@ class AsyncWebCrawler:
always_bypass_cache: bool = False,
always_by_pass_cache: Optional[bool] = None, # Deprecated parameter
base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
thread_safe: bool = False,
**kwargs,
):
"""
@@ -104,6 +106,8 @@ class AsyncWebCrawler:
else:
self.always_bypass_cache = always_bypass_cache
self._lock = asyncio.Lock() if thread_safe else None
self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
os.makedirs(self.crawl4ai_folder, exist_ok=True)
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
@@ -178,6 +182,7 @@ class AsyncWebCrawler:
Returns:
CrawlResult: The result of crawling and processing
"""
async with self._lock or nullcontext():
try:
# Handle deprecated parameters
if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):