Fix async generator type mismatch in Docker Client streaming

- Fixed single_result_generator to properly handle async generators from deep crawl strategies
- Added proper __aiter__ checking to distinguish between CrawlResult and async generators
- Await and yield individual results from nested async generators
- Streaming functionality now works correctly for all patterns (SDK, Direct API, Docker Client)
- All 22 comprehensive tests passing with 100% success rate
- Live streaming test confirmed working end-to-end
This commit is contained in:
AHMET YILMAZ
2025-08-15 15:49:11 +08:00
parent 07e9d651fb
commit 8e1362acf5
3 changed files with 641 additions and 33 deletions

View File

@@ -113,8 +113,12 @@ class Crawl4aiDockerClient:
self.logger.info(f"Crawling {len(urls)} URLs {'(streaming)' if is_streaming else ''}", tag="CRAWL")
if is_streaming:
# Create and return the async generator directly
return self._stream_crawl_results(data)
# For streaming, we need to return the async generator properly
# The caller should be able to do: async for result in await client.crawl(...)
async def streaming_wrapper():
async for result in self._stream_crawl_results(data):
yield result
return streaming_wrapper()
response = await self._request("POST", "/crawl", json=data)
result_data = response.json()
@@ -131,17 +135,27 @@ class Crawl4aiDockerClient:
response.raise_for_status()
async for line in response.aiter_lines():
if line.strip():
result = json.loads(line)
if "error" in result:
self.logger.error_status(url=result.get("url", "unknown"), error=result["error"])
try:
result = json.loads(line)
if "error" in result:
self.logger.error_status(url=result.get("url", "unknown"), error=result["error"])
continue
# Check if this is a crawl result (has required fields)
if "url" in result and "success" in result:
self.logger.url_status(url=result.get("url", "unknown"), success=result.get("success", False), timing=result.get("timing", 0.0))
# Create CrawlResult object properly
crawl_result = CrawlResult(**result)
yield crawl_result
# Skip status-only messages
elif result.get("status") == "completed":
continue
except json.JSONDecodeError as e:
self.logger.error(f"Failed to parse streaming response: {e}", tag="STREAM")
continue
# Check if this is a crawl result (has required fields)
if "url" in result and "success" in result:
self.logger.url_status(url=result.get("url", "unknown"), success=result.get("success", False), timing=result.get("timing", 0.0))
yield CrawlResult(**result)
# Skip status-only messages
elif result.get("status") == "completed":
except Exception as e:
self.logger.error(f"Error processing streaming result: {e}", tag="STREAM")
continue
async def get_schema(self) -> Dict[str, Any]: