feat: add optional thread_safe flag to AsyncWebCrawler that guards crawling with an asyncio.Lock
This commit is contained in:
@@ -7,6 +7,7 @@ from pathlib import Path
|
|||||||
from typing import Optional, List, Union
|
from typing import Optional, List, Union
|
||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
|
from contextlib import nullcontext
|
||||||
from .models import CrawlResult, MarkdownGenerationResult
|
from .models import CrawlResult, MarkdownGenerationResult
|
||||||
from .async_database import async_db_manager
|
from .async_database import async_db_manager
|
||||||
from .chunking_strategy import *
|
from .chunking_strategy import *
|
||||||
@@ -67,6 +68,7 @@ class AsyncWebCrawler:
|
|||||||
always_bypass_cache: bool = False,
|
always_bypass_cache: bool = False,
|
||||||
always_by_pass_cache: Optional[bool] = None, # Deprecated parameter
|
always_by_pass_cache: Optional[bool] = None, # Deprecated parameter
|
||||||
base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
|
base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
|
||||||
|
thread_safe: bool = False,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
@@ -104,6 +106,8 @@ class AsyncWebCrawler:
|
|||||||
else:
|
else:
|
||||||
self.always_bypass_cache = always_bypass_cache
|
self.always_bypass_cache = always_bypass_cache
|
||||||
|
|
||||||
|
self._lock = asyncio.Lock() if thread_safe else None
|
||||||
|
|
||||||
self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
|
self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
|
||||||
os.makedirs(self.crawl4ai_folder, exist_ok=True)
|
os.makedirs(self.crawl4ai_folder, exist_ok=True)
|
||||||
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
|
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
|
||||||
@@ -178,6 +182,7 @@ class AsyncWebCrawler:
|
|||||||
Returns:
|
Returns:
|
||||||
CrawlResult: The result of crawling and processing
|
CrawlResult: The result of crawling and processing
|
||||||
"""
|
"""
|
||||||
|
async with self._lock or nullcontext():
|
||||||
try:
|
try:
|
||||||
# Handle deprecated parameters
|
# Handle deprecated parameters
|
||||||
if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
|
if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
|
||||||
|
|||||||
Reference in New Issue
Block a user