Add Async Version, JsonCss Extractor

2024-09-03 01:27:00 +08:00
parent 3116f95c1a
commit c37614cbc8
17 changed files with 1922 additions and 2 deletions
--- a/tests/async/test_error_handling.py
+++ b/tests/async/test_error_handling.py
@@ -0,0 +1,78 @@
+# import os
+# import sys
+# import pytest
+# import asyncio
+
+# # Add the parent of this file's directory to the Python path
+# parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+# sys.path.append(parent_dir)
+
+# from crawl4ai.async_webcrawler import AsyncWebCrawler
+# from crawl4ai.utils import InvalidCSSSelectorError
+
+# class AsyncCrawlerWrapper:
+#     def __init__(self):
+#         self.crawler = None
+
+#     async def setup(self):
+#         self.crawler = AsyncWebCrawler(verbose=True)
+#         await self.crawler.awarmup()
+
+#     async def cleanup(self):
+#         if self.crawler:
+#             await self.crawler.aclear_cache()
+
+# @pytest.fixture(scope="module")
+# def crawler_wrapper():
+#     wrapper = AsyncCrawlerWrapper()
+#     asyncio.get_event_loop().run_until_complete(wrapper.setup())
+#     yield wrapper
+#     asyncio.get_event_loop().run_until_complete(wrapper.cleanup())
+
+# @pytest.mark.asyncio
+# async def test_network_error(crawler_wrapper):
+#     url = "https://www.nonexistentwebsite123456789.com"
+#     result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True)
+#     assert not result.success
+#     assert "Failed to crawl" in result.error_message
+
+# # @pytest.mark.asyncio
+# # async def test_timeout_error(crawler_wrapper):
+# #     # Simulating a timeout by using a very short timeout value
+# #     url = "https://www.nbcnews.com/business"
+# #     result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True, timeout=0.001)
+# #     assert not result.success
+# #     assert "timeout" in result.error_message.lower()
+
+# # @pytest.mark.asyncio
+# # async def test_invalid_css_selector(crawler_wrapper):
+# #     url = "https://www.nbcnews.com/business"
+# #     with pytest.raises(InvalidCSSSelectorError):
+# #         await crawler_wrapper.crawler.arun(url=url, bypass_cache=True, css_selector="invalid>>selector")
+
+# # @pytest.mark.asyncio
+# # async def test_js_execution_error(crawler_wrapper):
+# #     url = "https://www.nbcnews.com/business"
+# #     invalid_js = "This is not valid JavaScript code;"
+# #     result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True, js=invalid_js)
+# #     assert not result.success
+# #     assert "JavaScript" in result.error_message
+
+# # @pytest.mark.asyncio
+# # async def test_empty_page(crawler_wrapper):
+# #     # Use a URL that typically returns an empty page
+# #     url = "http://example.com/empty"
+# #     result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True)
+# #     assert result.success  # The crawl itself should succeed
+# #     assert not result.markdown.strip()  # The markdown content should be empty or just whitespace
+
+# # @pytest.mark.asyncio
+# # async def test_rate_limiting(crawler_wrapper):
+# #     # Simulate rate limiting by making multiple rapid requests
+# #     url = "https://www.nbcnews.com/business"
+# #     results = await asyncio.gather(*[crawler_wrapper.crawler.arun(url=url, bypass_cache=True) for _ in range(10)])
+# #     assert any(not result.success and "rate limit" in result.error_message.lower() for result in results)
+
+# # Entry point for debugging
+# if __name__ == "__main__":
+#     pytest.main([__file__, "-v"])