Modify the test file: uncomment the deep crawl endpoint tests

2025-04-15 22:28:01 +08:00
parent 230f22da86
commit 5206c6f2d6
1 changed file with 380 additions and 380 deletions
--- a/tests/docker/test_rest_api_deep_crawl.py
+++ b/tests/docker/test_rest_api_deep_crawl.py
@@ -119,411 +119,411 @@ class TestDeepCrawlEndpoints:
        await check_server_health(async_client)
    # 1. Basic Deep Crawl
-    # async def test_deep_crawl_basic_bfs(self, async_client: httpx.AsyncClient):
+    async def test_deep_crawl_basic_bfs(self, async_client: httpx.AsyncClient):
-    #     """Test BFS deep crawl with limited depth and pages."""
+        """Test BFS deep crawl with limited depth and pages."""
-    #     max_depth = 1
+        max_depth = 1
-    #     max_pages = 3 # start_url + 2 more
+        max_pages = 3 # start_url + 2 more
-    #     payload = {
+        payload = {
-    #         "urls": [DEEP_CRAWL_BASE_URL],
+            "urls": [DEEP_CRAWL_BASE_URL],
-    #         "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+            "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
-    #         "crawler_config": {
+            "crawler_config": {
-    #             "type": "CrawlerRunConfig",
+                "type": "CrawlerRunConfig",
-    #             "params": {
+                "params": {
-    #                 "stream": False,
+                    "stream": False,
-    #                 "cache_mode": "BYPASS", # Use string value for CacheMode
+                    "cache_mode": "BYPASS", # Use string value for CacheMode
-    #                 "deep_crawl_strategy": {
+                    "deep_crawl_strategy": {
-    #                     "type": "BFSDeepCrawlStrategy",
+                        "type": "BFSDeepCrawlStrategy",
-    #                     "params": {
+                        "params": {
-    #                         "max_depth": max_depth,
+                            "max_depth": max_depth,
-    #                         "max_pages": max_pages,
+                            "max_pages": max_pages,
-    #                         # Minimal filters for basic test
+                            # Minimal filters for basic test
-    #                         "filter_chain": {
+                            "filter_chain": {
-    #                             "type": "FilterChain",
+                                "type": "FilterChain",
-    #                             "params": {
+                                "params": {
-    #                                 "filters": [
+                                    "filters": [
-    #                                     {
+                                        {
-    #                                         "type": "DomainFilter",
+                                            "type": "DomainFilter",
-    #                                         "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
+                                            "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
-    #                                     }
+                                        }
-    #                                 ]
+                                    ]
-    #                             }
+                                }
-    #                         }
+                            }
-    #                     }
+                        }
-    #                 }
+                    }
-    #             }
+                }
-    #         }
+            }
-    #     }
+        }
-    #     response = await async_client.post("/crawl", json=payload)
+        response = await async_client.post("/crawl", json=payload)
-    #     response.raise_for_status()
+        response.raise_for_status()
-    #     data = response.json()
+        data = response.json()
-    #     assert data["success"] is True
+        assert data["success"] is True
-    #     assert isinstance(data["results"], list)
+        assert isinstance(data["results"], list)
-    #     assert len(data["results"]) > 1 # Should be more than just the start URL
+        assert len(data["results"]) > 1 # Should be more than just the start URL
-    #     assert len(data["results"]) <= max_pages # Respect max_pages
+        assert len(data["results"]) <= max_pages # Respect max_pages
-    #     found_depth_0 = False
+        found_depth_0 = False
-    #     found_depth_1 = False
+        found_depth_1 = False
-    #     for result in data["results"]:
+        for result in data["results"]:
-    #         await assert_crawl_result_structure(result)
+            await assert_crawl_result_structure(result)
-    #         assert result["success"] is True
+            assert result["success"] is True
-    #         assert DEEP_CRAWL_DOMAIN in result["url"]
+            assert DEEP_CRAWL_DOMAIN in result["url"]
-    #         depth = result["metadata"]["depth"]
+            depth = result["metadata"]["depth"]
-    #         assert depth <= max_depth
+            assert depth <= max_depth
-    #         if depth == 0: found_depth_0 = True
+            if depth == 0: found_depth_0 = True
-    #         if depth == 1: found_depth_1 = True
+            if depth == 1: found_depth_1 = True
-    #     assert found_depth_0
+        assert found_depth_0
-    #     assert found_depth_1
+        assert found_depth_1
-    # # 2. Deep Crawl with Filtering
+    # 2. Deep Crawl with Filtering
-    # async def test_deep_crawl_with_filters(self, async_client: httpx.AsyncClient):
+    async def test_deep_crawl_with_filters(self, async_client: httpx.AsyncClient):
-    #     """Test BFS deep crawl with content type and domain filters."""
+        """Test BFS deep crawl with content type and domain filters."""
-    #     max_depth = 1
+        max_depth = 1
-    #     max_pages = 5
+        max_pages = 5
-    #     payload = {
+        payload = {
-    #         "urls": [DEEP_CRAWL_BASE_URL],
+            "urls": [DEEP_CRAWL_BASE_URL],
-    #         "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+            "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
-    #         "crawler_config": {
+            "crawler_config": {
-    #             "type": "CrawlerRunConfig",
+                "type": "CrawlerRunConfig",
-    #             "params": {
+                "params": {
-    #                 "stream": False,
+                    "stream": False,
-    #                 "cache_mode": "BYPASS",
+                    "cache_mode": "BYPASS",
-    #                 "deep_crawl_strategy": {
+                    "deep_crawl_strategy": {
-    #                     "type": "BFSDeepCrawlStrategy",
+                        "type": "BFSDeepCrawlStrategy",
-    #                     "params": {
+                        "params": {
-    #                         "max_depth": max_depth,
+                            "max_depth": max_depth,
-    #                         "max_pages": max_pages,
+                            "max_pages": max_pages,
-    #                         "filter_chain": {
+                            "filter_chain": {
-    #                             "type": "FilterChain",
+                                "type": "FilterChain",
-    #                             "params": {
+                                "params": {
-    #                                 "filters": [
+                                    "filters": [
-    #                                     {
+                                        {
-    #                                         "type": "DomainFilter",
+                                            "type": "DomainFilter",
-    #                                         "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
+                                            "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
-    #                                     },
+                                        },
-    #                                     {
+                                        {
-    #                                         "type": "ContentTypeFilter",
+                                            "type": "ContentTypeFilter",
-    #                                         "params": {"allowed_types": ["text/html"]}
+                                            "params": {"allowed_types": ["text/html"]}
-    #                                     },
+                                        },
-    #                                     # Example: Exclude specific paths using regex
+                                        # Example: Exclude specific paths using regex
-    #                                     {
+                                        {
-    #                                         "type": "URLPatternFilter",
+                                            "type": "URLPatternFilter",
-    #                                          "params": {
+                                             "params": {
-    #                                              "patterns": ["*/category-3/*"], # Block category 3
+                                                 "patterns": ["*/category-3/*"], # Block category 3
-    #                                              "reverse": True # Block if match
+                                                 "reverse": True # Block if match
-    #                                          }
+                                             }
-    #                                     }
+                                        }
-    #                                 ]
+                                    ]
-    #                             }
+                                }
-    #                         }
+                            }
-    #                     }
+                        }
-    #                 }
+                    }
-    #             }
+                }
-    #         }
+            }
-    #     }
+        }
-    #     response = await async_client.post("/crawl", json=payload)
+        response = await async_client.post("/crawl", json=payload)
-    #     response.raise_for_status()
+        response.raise_for_status()
-    #     data = response.json()
+        data = response.json()
-    #     assert data["success"] is True
+        assert data["success"] is True
-    #     assert len(data["results"]) > 0
+        assert len(data["results"]) > 0
-    #     assert len(data["results"]) <= max_pages
+        assert len(data["results"]) <= max_pages
-    #     for result in data["results"]:
+        for result in data["results"]:
-    #         await assert_crawl_result_structure(result)
+            await assert_crawl_result_structure(result)
-    #         assert result["success"] is True
+            assert result["success"] is True
-    #         assert DEEP_CRAWL_DOMAIN in result["url"]
+            assert DEEP_CRAWL_DOMAIN in result["url"]
-    #         assert "category-3" not in result["url"] # Check if filter worked
+            assert "category-3" not in result["url"] # Check if filter worked
-    #         assert result["metadata"]["depth"] <= max_depth
+            assert result["metadata"]["depth"] <= max_depth
-    # # 3. Deep Crawl with Scoring
+    # 3. Deep Crawl with Scoring
-    # async def test_deep_crawl_with_scoring(self, async_client: httpx.AsyncClient):
+    async def test_deep_crawl_with_scoring(self, async_client: httpx.AsyncClient):
-    #     """Test BFS deep crawl with URL scoring."""
+        """Test BFS deep crawl with URL scoring."""
-    #     max_depth = 1
+        max_depth = 1
-    #     max_pages = 4
+        max_pages = 4
-    #     payload = {
+        payload = {
-    #         "urls": [DEEP_CRAWL_BASE_URL],
+            "urls": [DEEP_CRAWL_BASE_URL],
-    #         "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+            "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
-    #         "crawler_config": {
+            "crawler_config": {
-    #             "type": "CrawlerRunConfig",
+                "type": "CrawlerRunConfig",
-    #             "params": {
+                "params": {
-    #                 "stream": False,
+                    "stream": False,
-    #                 "cache_mode": "BYPASS",
+                    "cache_mode": "BYPASS",
-    #                 "deep_crawl_strategy": {
+                    "deep_crawl_strategy": {
-    #                     "type": "BFSDeepCrawlStrategy",
+                        "type": "BFSDeepCrawlStrategy",
-    #                     "params": {
+                        "params": {
-    #                         "max_depth": max_depth,
+                            "max_depth": max_depth,
-    #                         "max_pages": max_pages,
+                            "max_pages": max_pages,
-    #                         "filter_chain": { # Keep basic domain filter
+                            "filter_chain": { # Keep basic domain filter
-    #                             "type": "FilterChain",
+                                "type": "FilterChain",
-    #                             "params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]}
+                                "params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]}
-    #                         },
+                            },
-    #                         "url_scorer": { # Add scorer
+                            "url_scorer": { # Add scorer
-    #                             "type": "CompositeScorer",
+                                "type": "CompositeScorer",
-    #                             "params": {
+                                "params": {
-    #                                 "scorers": [
+                                    "scorers": [
-    #                                     {   # Favor pages with 'product' in the URL
+                                        {   # Favor pages with 'product' in the URL
-    #                                         "type": "KeywordRelevanceScorer",
+                                            "type": "KeywordRelevanceScorer",
-    #                                         "params": {"keywords": ["product"], "weight": 1.0}
+                                            "params": {"keywords": ["product"], "weight": 1.0}
-    #                                     },
+                                        },
-    #                                     {   # Penalize deep paths slightly
+                                        {   # Penalize deep paths slightly
-    #                                         "type": "PathDepthScorer",
+                                            "type": "PathDepthScorer",
-    #                                         "params": {"optimal_depth": 2, "weight": -0.2}
+                                            "params": {"optimal_depth": 2, "weight": -0.2}
-    #                                     }
+                                        }
-    #                                 ]
+                                    ]
-    #                             }
+                                }
-    #                         },
+                            },
-    #                         # Set a threshold if needed: "score_threshold": 0.1
+                            # Set a threshold if needed: "score_threshold": 0.1
-    #                     }
+                        }
-    #                 }
+                    }
-    #             }
+                }
-    #         }
+            }
-    #     }
+        }
-    #     response = await async_client.post("/crawl", json=payload)
+        response = await async_client.post("/crawl", json=payload)
-    #     response.raise_for_status()
+        response.raise_for_status()
-    #     data = response.json()
+        data = response.json()
-    #     assert data["success"] is True
+        assert data["success"] is True
-    #     assert len(data["results"]) > 0
+        assert len(data["results"]) > 0
-    #     assert len(data["results"]) <= max_pages
+        assert len(data["results"]) <= max_pages
-    #     # Check if results seem biased towards products (harder to assert strictly without knowing exact scores)
+        # Check if results seem biased towards products (harder to assert strictly without knowing exact scores)
-    #     product_urls_found = any("product_" in result["url"] for result in data["results"] if result["metadata"]["depth"] > 0)
+        product_urls_found = any("product_" in result["url"] for result in data["results"] if result["metadata"]["depth"] > 0)
-    #     print(f"Product URLs found among depth > 0 results: {product_urls_found}")
+        print(f"Product URLs found among depth > 0 results: {product_urls_found}")
-    #     # We expect scoring to prioritize product pages if available within limits
+        # We expect scoring to prioritize product pages if available within limits
-    #     # assert product_urls_found # This might be too strict depending on site structure and limits
+        # assert product_urls_found # This might be too strict depending on site structure and limits
-    #     for result in data["results"]:
+        for result in data["results"]:
-    #         await assert_crawl_result_structure(result)
+            await assert_crawl_result_structure(result)
-    #         assert result["success"] is True
+            assert result["success"] is True
-    #         assert result["metadata"]["depth"] <= max_depth
+            assert result["metadata"]["depth"] <= max_depth
-    # # 4. Deep Crawl with CSS Extraction
+    # 4. Deep Crawl with CSS Extraction
-    # async def test_deep_crawl_with_css_extraction(self, async_client: httpx.AsyncClient):
+    async def test_deep_crawl_with_css_extraction(self, async_client: httpx.AsyncClient):
-    #     """Test BFS deep crawl combined with JsonCssExtractionStrategy."""
+        """Test BFS deep crawl combined with JsonCssExtractionStrategy."""
-    #     max_depth = 6 # Go deep enough to reach product pages
+        max_depth = 6 # Go deep enough to reach product pages
-    #     max_pages = 20
+        max_pages = 20
-    #     # Schema to extract product details
+        # Schema to extract product details
-    #     product_schema = {
+        product_schema = {
-    #         "name": "ProductDetails",
+            "name": "ProductDetails",
-    #         "baseSelector": "div.container", # Base for product page
+            "baseSelector": "div.container", # Base for product page
-    #         "fields": [
+            "fields": [
-    #             {"name": "product_title", "selector": "h1", "type": "text"},
+                {"name": "product_title", "selector": "h1", "type": "text"},
-    #             {"name": "price", "selector": ".product-price", "type": "text"},
+                {"name": "price", "selector": ".product-price", "type": "text"},
-    #             {"name": "description", "selector": ".product-description p", "type": "text"},
+                {"name": "description", "selector": ".product-description p", "type": "text"},
-    #             {"name": "specs", "selector": ".product-specs li", "type": "list", "fields":[
+                {"name": "specs", "selector": ".product-specs li", "type": "list", "fields":[
-    #                  {"name": "spec_name", "selector": ".spec-name", "type": "text"},
+                     {"name": "spec_name", "selector": ".spec-name", "type": "text"},
-    #                  {"name": "spec_value", "selector": ".spec-value", "type": "text"}
+                     {"name": "spec_value", "selector": ".spec-value", "type": "text"}
-    #             ]}
+                ]}
-    #         ]
+            ]
-    #     }
+        }
-    #     payload = {
+        payload = {
-    #         "urls": [DEEP_CRAWL_BASE_URL],
+            "urls": [DEEP_CRAWL_BASE_URL],
-    #         "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+            "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
-    #         "crawler_config": {
+            "crawler_config": {
-    #             "type": "CrawlerRunConfig",
+                "type": "CrawlerRunConfig",
-    #             "params": {
+                "params": {
-    #                 "stream": False,
+                    "stream": False,
-    #                 "cache_mode": "BYPASS",
+                    "cache_mode": "BYPASS",
-    #                 "extraction_strategy": { # Apply extraction to ALL crawled pages
+                    "extraction_strategy": { # Apply extraction to ALL crawled pages
-    #                     "type": "JsonCssExtractionStrategy",
+                        "type": "JsonCssExtractionStrategy",
-    #                     "params": {"schema": {"type": "dict", "value": product_schema}}
+                        "params": {"schema": {"type": "dict", "value": product_schema}}
-    #                 },
+                    },
-    #                 "deep_crawl_strategy": {
+                    "deep_crawl_strategy": {
-    #                     "type": "BFSDeepCrawlStrategy",
+                        "type": "BFSDeepCrawlStrategy",
-    #                     "params": {
+                        "params": {
-    #                         "max_depth": max_depth,
+                            "max_depth": max_depth,
-    #                         "max_pages": max_pages,
+                            "max_pages": max_pages,
-    #                         "filter_chain": { # Only crawl HTML on our domain
+                            "filter_chain": { # Only crawl HTML on our domain
-    #                             "type": "FilterChain",
+                                "type": "FilterChain",
-    #                             "params": {
+                                "params": {
-    #                                 "filters": [
+                                    "filters": [
-    #                                     {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
+                                        {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
-    #                                     {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
+                                        {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
-    #                                 ]
+                                    ]
-    #                             }
+                                }
-    #                         }
+                            }
-    #                         # Optional: Add scoring to prioritize product pages for extraction
+                            # Optional: Add scoring to prioritize product pages for extraction
-    #                     }
+                        }
-    #                 }
+                    }
-    #             }
+                }
-    #         }
+            }
-    #     }
+        }
-    #     response = await async_client.post("/crawl", json=payload)
+        response = await async_client.post("/crawl", json=payload)
-    #     response.raise_for_status()
+        response.raise_for_status()
-    #     data = response.json()
+        data = response.json()
-    #     assert data["success"] is True
+        assert data["success"] is True
-    #     assert len(data["results"]) > 0
+        assert len(data["results"]) > 0
-    #     # assert len(data["results"]) <= max_pages
+        # assert len(data["results"]) <= max_pages
-    #     found_extracted_product = False
+        found_extracted_product = False
-    #     for result in data["results"]:
+        for result in data["results"]:
-    #         await assert_crawl_result_structure(result)
+            await assert_crawl_result_structure(result)
-    #         assert result["success"] is True
+            assert result["success"] is True
-    #         assert "extracted_content" in result
+            assert "extracted_content" in result
-    #         if "product_" in result["url"]: # Check product pages specifically
+            if "product_" in result["url"]: # Check product pages specifically
-    #              assert result["extracted_content"] is not None
+                 assert result["extracted_content"] is not None
-    #              try:
+                 try:
-    #                  extracted = json.loads(result["extracted_content"])
+                     extracted = json.loads(result["extracted_content"])
-    #                  # Schema returns list even if one base match
+                     # Schema returns list even if one base match
-    #                  assert isinstance(extracted, list)
+                     assert isinstance(extracted, list)
-    #                  if extracted:
+                     if extracted:
-    #                      item = extracted[0]
+                         item = extracted[0]
-    #                      assert "product_title" in item and item["product_title"]
+                         assert "product_title" in item and item["product_title"]
-    #                      assert "price" in item and item["price"]
+                         assert "price" in item and item["price"]
-    #                      # Specs might be empty list if not found
+                         # Specs might be empty list if not found
-    #                      assert "specs" in item and isinstance(item["specs"], list)
+                         assert "specs" in item and isinstance(item["specs"], list)
-    #                      found_extracted_product = True
+                         found_extracted_product = True
-    #                      print(f"Extracted product: {item.get('product_title')}")
+                         print(f"Extracted product: {item.get('product_title')}")
-    #              except (json.JSONDecodeError, AssertionError, IndexError) as e:
+                 except (json.JSONDecodeError, AssertionError, IndexError) as e:
-    #                   pytest.fail(f"Extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")
+                      pytest.fail(f"Extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")
-    #         # else:
+            # else:
-    #         #      # Non-product pages might have None or empty list depending on schema match
+            #      # Non-product pages might have None or empty list depending on schema match
-    #         #      assert result["extracted_content"] is None or json.loads(result["extracted_content"]) == []
+            #      assert result["extracted_content"] is None or json.loads(result["extracted_content"]) == []
-    #     assert found_extracted_product, "Did not find any pages where product data was successfully extracted."
+        assert found_extracted_product, "Did not find any pages where product data was successfully extracted."
-    # # 5. Deep Crawl with LLM Extraction (Requires Server LLM Setup)
+    # 5. Deep Crawl with LLM Extraction (Requires Server LLM Setup)
-    # async def test_deep_crawl_with_llm_extraction(self, async_client: httpx.AsyncClient):
+    async def test_deep_crawl_with_llm_extraction(self, async_client: httpx.AsyncClient):
-    #     """Test BFS deep crawl combined with LLMExtractionStrategy."""
+        """Test BFS deep crawl combined with LLMExtractionStrategy."""
-    #     max_depth = 1 # Limit depth to keep LLM calls manageable
+        max_depth = 1 # Limit depth to keep LLM calls manageable
-    #     max_pages = 3
+        max_pages = 3
-    #     payload = {
+        payload = {
-    #         "urls": [DEEP_CRAWL_BASE_URL],
+            "urls": [DEEP_CRAWL_BASE_URL],
-    #         "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+            "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
-    #         "crawler_config": {
+            "crawler_config": {
-    #             "type": "CrawlerRunConfig",
+                "type": "CrawlerRunConfig",
-    #             "params": {
+                "params": {
-    #                 "stream": False,
+                    "stream": False,
-    #                 "cache_mode": "BYPASS",
+                    "cache_mode": "BYPASS",
-    #                 "extraction_strategy": { # Apply LLM extraction to crawled pages
+                    "extraction_strategy": { # Apply LLM extraction to crawled pages
-    #                     "type": "LLMExtractionStrategy",
+                        "type": "LLMExtractionStrategy",
-    #                     "params": {
+                        "params": {
-    #                         "instruction": "Extract the main H1 title and the text content of the first paragraph.",
+                            "instruction": "Extract the main H1 title and the text content of the first paragraph.",
-    #                         "llm_config": { # Example override, rely on server default if possible
+                            "llm_config": { # Example override, rely on server default if possible
-    #                            "type": "LLMConfig",
+                               "type": "LLMConfig",
-    #                            "params": {"provider": "openai/gpt-4.1-mini"} # Use a cheaper model for testing
+                               "params": {"provider": "openai/gpt-4.1-mini"} # Use a cheaper model for testing
-    #                         },
+                            },
-    #                          "schema": { # Expected JSON output
+                             "schema": { # Expected JSON output
-    #                             "type": "dict",
+                                "type": "dict",
-    #                             "value": {
+                                "value": {
-    #                                 "title": "PageContent", "type": "object",
+                                    "title": "PageContent", "type": "object",
-    #                                 "properties": {
+                                    "properties": {
-    #                                     "h1_title": {"type": "string"},
+                                        "h1_title": {"type": "string"},
-    #                                     "first_paragraph": {"type": "string"}
+                                        "first_paragraph": {"type": "string"}
-    #                                 }
+                                    }
-    #                             }
+                                }
-    #                         }
+                            }
-    #                     }
+                        }
-    #                 },
+                    },
-    #                 "deep_crawl_strategy": {
+                    "deep_crawl_strategy": {
-    #                     "type": "BFSDeepCrawlStrategy",
+                        "type": "BFSDeepCrawlStrategy",
-    #                     "params": {
+                        "params": {
-    #                         "max_depth": max_depth,
+                            "max_depth": max_depth,
-    #                         "max_pages": max_pages,
+                            "max_pages": max_pages,
-    #                         "filter_chain": {
+                            "filter_chain": {
-    #                             "type": "FilterChain",
+                                "type": "FilterChain",
-    #                             "params": {
+                                "params": {
-    #                                 "filters": [
+                                    "filters": [
-    #                                     {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
+                                        {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
-    #                                     {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
+                                        {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
-    #                                 ]
+                                    ]
-    #                             }
+                                }
-    #                         }
+                            }
-    #                     }
+                        }
-    #                 }
+                    }
-    #             }
+                }
-    #         }
+            }
-    #     }
+        }
-    #     try:
+        try:
-    #         response = await async_client.post("/crawl", json=payload)
+            response = await async_client.post("/crawl", json=payload)
-    #         response.raise_for_status()
+            response.raise_for_status()
-    #         data = response.json()
+            data = response.json()
-    #     except httpx.HTTPStatusError as e:
+        except httpx.HTTPStatusError as e:
-    #         pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and LLM API key setup.")
+            pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and LLM API key setup.")
-    #     except httpx.RequestError as e:
+        except httpx.RequestError as e:
-    #          pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}.")
+             pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}.")
-    #     assert data["success"] is True
+        assert data["success"] is True
-    #     assert len(data["results"]) > 0
+        assert len(data["results"]) > 0
-    #     assert len(data["results"]) <= max_pages
+        assert len(data["results"]) <= max_pages
-    #     found_llm_extraction = False
+        found_llm_extraction = False
-    #     for result in data["results"]:
+        for result in data["results"]:
-    #         await assert_crawl_result_structure(result)
+            await assert_crawl_result_structure(result)
-    #         assert result["success"] is True
+            assert result["success"] is True
-    #         assert "extracted_content" in result
+            assert "extracted_content" in result
-    #         assert result["extracted_content"] is not None
+            assert result["extracted_content"] is not None
-    #         try:
+            try:
-    #             extracted = json.loads(result["extracted_content"])
+                extracted = json.loads(result["extracted_content"])
-    #             if isinstance(extracted, list): extracted = extracted[0] # Handle list output
+                if isinstance(extracted, list): extracted = extracted[0] # Handle list output
-    #             assert isinstance(extracted, dict)
+                assert isinstance(extracted, dict)
-    #             assert "h1_title" in extracted # Check keys based on schema
+                assert "h1_title" in extracted # Check keys based on schema
-    #             assert "first_paragraph" in extracted
+                assert "first_paragraph" in extracted
-    #             found_llm_extraction = True
+                found_llm_extraction = True
-    #             print(f"LLM extracted from {result['url']}: Title='{extracted.get('h1_title')}'")
+                print(f"LLM extracted from {result['url']}: Title='{extracted.get('h1_title')}'")
-    #         except (json.JSONDecodeError, AssertionError, IndexError, TypeError) as e:
+            except (json.JSONDecodeError, AssertionError, IndexError, TypeError) as e:
-    #             pytest.fail(f"LLM extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")
+                pytest.fail(f"LLM extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")
-    #     assert found_llm_extraction, "LLM extraction did not yield expected data on any crawled page."
+        assert found_llm_extraction, "LLM extraction did not yield expected data on any crawled page."
-    # # 6. Deep Crawl with SSL Certificate Fetching
+    # 6. Deep Crawl with SSL Certificate Fetching
-    # async def test_deep_crawl_with_ssl(self, async_client: httpx.AsyncClient):
+    async def test_deep_crawl_with_ssl(self, async_client: httpx.AsyncClient):
-    #     """Test BFS deep crawl with fetch_ssl_certificate enabled."""
+        """Test a BFS deep crawl of only the start URL with fetch_ssl_certificate enabled, validating the certificate fields when one is returned."""
-    #     max_depth = 0 # Only fetch for start URL to keep test fast
+        max_depth = 0 # Depth 0: crawl only the start URL to keep the test fast
-    #     max_pages = 1
+        max_pages = 1
-    #     payload = {
+        payload = {
-    #         "urls": [DEEP_CRAWL_BASE_URL],
+            "urls": [DEEP_CRAWL_BASE_URL],
-    #         "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+            "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
-    #         "crawler_config": {
+            "crawler_config": {
-    #             "type": "CrawlerRunConfig",
+                "type": "CrawlerRunConfig",
-    #             "params": {
+                "params": {
-    #                 "stream": False,
+                    "stream": False,
-    #                 "cache_mode": "BYPASS",
+                    "cache_mode": "BYPASS",
-    #                 "fetch_ssl_certificate": True, # <-- Enable SSL fetching
+                    "fetch_ssl_certificate": True, # Request SSL certificate fetching for this crawl
-    #                 "deep_crawl_strategy": {
+                    "deep_crawl_strategy": {
-    #                     "type": "BFSDeepCrawlStrategy",
+                        "type": "BFSDeepCrawlStrategy",
-    #                     "params": {
+                        "params": {
-    #                         "max_depth": max_depth,
+                            "max_depth": max_depth,
-    #                         "max_pages": max_pages,
+                            "max_pages": max_pages,
-    #                     }
+                        }
-    #                 }
+                    }
-    #             }
+                }
-    #         }
+            }
-    #     }
+        }
-    #     response = await async_client.post("/crawl", json=payload)
+        response = await async_client.post("/crawl", json=payload)
-    #     response.raise_for_status()
+        response.raise_for_status()
-    #     data = response.json()
+        data = response.json()
-    #     assert data["success"] is True
+        assert data["success"] is True
-    #     assert len(data["results"]) == 1
+        assert len(data["results"]) == 1
-    #     result = data["results"][0]
+        result = data["results"][0]
-    #     await assert_crawl_result_structure(result, check_ssl=True) # <-- Tell helper to check SSL field
+        await assert_crawl_result_structure(result, check_ssl=True) # Have the helper check the SSL field as well
-    #     assert result["success"] is True
+        assert result["success"] is True
-    #             # Check if SSL info was actually retrieved
+        # The certificate may be null; validate its fields only when one was retrieved
-    #     if result["ssl_certificate"]:
+        if result["ssl_certificate"]:
-    #         # Assert directly using dictionary keys
+            # The certificate arrives as a dict, so check its keys directly
-    #         assert isinstance(result["ssl_certificate"], dict) # Verify it's a dict
+            assert isinstance(result["ssl_certificate"], dict) # Verify it's a dict
-    #         assert "issuer" in result["ssl_certificate"]
+            assert "issuer" in result["ssl_certificate"]
-    #         assert "subject" in result["ssl_certificate"]
+            assert "subject" in result["ssl_certificate"]
-    #         # --- MODIFIED ASSERTIONS ---
+            # Validity-window keys ("not_before" is printed below as "Valid From")
-    #         assert "not_before" in result["ssl_certificate"] # Check for the actual key
+            assert "not_before" in result["ssl_certificate"] # Key must be present
-    #         assert "not_after" in result["ssl_certificate"]  # Check for the actual key
+            assert "not_after" in result["ssl_certificate"]  # Key must be present
-    #         # --- END MODIFICATIONS ---
+            # --- end of validity-window checks ---
-    #         assert "fingerprint" in result["ssl_certificate"] # Check another key
+            assert "fingerprint" in result["ssl_certificate"] # Check another key
-    #         # This print statement using .get() already works correctly with dictionaries
+            # Log the issuer organization and validity start; .get() falls back to 'N/A' for missing keys
-    #         print(f"SSL Issuer Org: {result['ssl_certificate'].get('issuer', {}).get('O', 'N/A')}")
+            print(f"SSL Issuer Org: {result['ssl_certificate'].get('issuer', {}).get('O', 'N/A')}")
-    #         print(f"SSL Valid From: {result['ssl_certificate'].get('not_before', 'N/A')}")
+            print(f"SSL Valid From: {result['ssl_certificate'].get('not_before', 'N/A')}")
-    #     else:
+        else:
-    #         # This part remains the same
+            # No certificate returned: only logged, so the test still passes. NOTE(review): confirm this leniency is intended
-    #         print("SSL Certificate was null in the result.")
+            print("SSL Certificate was null in the result.")
    # 7. Deep Crawl with Proxy Rotation (Requires PROXIES env var)