From 415c1c5bee6da143b36ae0f348f1fb3eb46c0de6 Mon Sep 17 00:00:00 2001
From: UncleCode <unclecode@kidocode.com>
Date: Tue, 4 Mar 2025 18:23:55 +0800
Subject: [PATCH] refactor(core): replace float('inf') with math.inf

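Replace float('inf') and float('-inf') with math.inf and -math.inf (imported as `infinity` via `from math import inf as infinity`) for better readability and performance. Also replace wildcard imports in the OpenAI pricing example with explicit ones, import CosineStrategy locally and create the tmp directory in the quickstart SSL certificate example, and remove the unused speed comparison code.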

No breaking changes.
---
 crawl4ai/async_dispatcher.py                  |  4 +-
 crawl4ai/deep_crawling/bff_strategy.py        |  3 +-
 crawl4ai/deep_crawling/bfs_strategy.py        |  5 +-
 .../examples/llm_extraction_openai_pricing.py | 11 ++--
 docs/examples/quickstart_async.config.py      | 65 ++-----------------
 5 files changed, 16 insertions(+), 72 deletions(-)

diff --git a/crawl4ai/async_dispatcher.py b/crawl4ai/async_dispatcher.py
index 56c4d567..69d276fb 100644
--- a/crawl4ai/async_dispatcher.py
+++ b/crawl4ai/async_dispatcher.py
@@ -24,6 +24,8 @@ from urllib.parse import urlparse
 import random
 from abc import ABC, abstractmethod
 
+from math import inf as infinity
+
 
 class RateLimiter:
     def __init__(
@@ -250,7 +252,7 @@ class CrawlerMonitor:
             key=lambda x: (
                 x.status != CrawlStatus.IN_PROGRESS,
                 x.status != CrawlStatus.QUEUED,
-                x.end_time or float('inf'),
+                x.end_time or infinity,
             ),
         )[: self.max_visible_rows]
 
diff --git a/crawl4ai/deep_crawling/bff_strategy.py b/crawl4ai/deep_crawling/bff_strategy.py
index fc2047d2..4811ba14 100644
--- a/crawl4ai/deep_crawling/bff_strategy.py
+++ b/crawl4ai/deep_crawling/bff_strategy.py
@@ -12,6 +12,7 @@ from . import DeepCrawlStrategy
 
 from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn
 
+from math import inf as infinity
 
 # Configurable batch size for processing items from the priority queue
 BATCH_SIZE = 10
@@ -37,7 +38,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
         filter_chain: FilterChain = FilterChain(),
         url_scorer: Optional[URLScorer] = None,
         include_external: bool = False,
-        max_pages: int = float('inf'),
+        max_pages: int = infinity,
         logger: Optional[logging.Logger] = None,
     ):
         self.max_depth = max_depth
diff --git a/crawl4ai/deep_crawling/bfs_strategy.py b/crawl4ai/deep_crawling/bfs_strategy.py
index dcdee30c..1ae4c4b9 100644
--- a/crawl4ai/deep_crawling/bfs_strategy.py
+++ b/crawl4ai/deep_crawling/bfs_strategy.py
@@ -10,6 +10,7 @@ from .filters import FilterChain
 from .scorers import URLScorer
 from . import DeepCrawlStrategy  
 from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
+from math import inf as infinity
 
 class BFSDeepCrawlStrategy(DeepCrawlStrategy):
     """
@@ -26,8 +27,8 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
         filter_chain: FilterChain = FilterChain(),
         url_scorer: Optional[URLScorer] = None,        
         include_external: bool = False,
-        score_threshold: float = float('-inf'),
-        max_pages: int = float('inf'),
+        score_threshold: float = -infinity,
+        max_pages: int = infinity,
         logger: Optional[logging.Logger] = None,
     ):
         self.max_depth = max_depth
diff --git a/docs/examples/llm_extraction_openai_pricing.py b/docs/examples/llm_extraction_openai_pricing.py
index 72742bd5..d77fe337 100644
--- a/docs/examples/llm_extraction_openai_pricing.py
+++ b/docs/examples/llm_extraction_openai_pricing.py
@@ -1,10 +1,11 @@
 from crawl4ai.async_configs import LlmConfig
-from crawl4ai.extraction_strategy import *
-from crawl4ai.crawler_strategy import *
+from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy
 import asyncio
+import os
+import json
 from pydantic import BaseModel, Field
 
-url = r"https://openai.com/api/pricing/"
+url = "https://openai.com/api/pricing/"
 
 
 class OpenAIModelFee(BaseModel):
@@ -14,10 +15,6 @@ class OpenAIModelFee(BaseModel):
         ..., description="Fee for output token for the OpenAI model."
     )
 
-
-from crawl4ai import AsyncWebCrawler
-
-
 async def main():
     # Use AsyncWebCrawler
     async with AsyncWebCrawler() as crawler:
diff --git a/docs/examples/quickstart_async.config.py b/docs/examples/quickstart_async.config.py
index 450880a9..184158c3 100644
--- a/docs/examples/quickstart_async.config.py
+++ b/docs/examples/quickstart_async.config.py
@@ -416,6 +416,7 @@ async def crawl_dynamic_content_pages_method_2():
 
 
 async def cosine_similarity_extraction():
+    from crawl4ai.extraction_strategy import CosineStrategy
     crawl_config = CrawlerRunConfig(
         cache_mode=CacheMode.BYPASS,
         extraction_strategy=CosineStrategy(
@@ -507,6 +508,9 @@ async def ssl_certification():
         if result.success and result.ssl_certificate:
             cert = result.ssl_certificate
 
+            tmp_dir = os.path.join(__location__, "tmp")
+            os.makedirs(tmp_dir, exist_ok=True)
+
             # 1. Access certificate properties directly
             print("\nCertificate Information:")
             print(f"Issuer: {cert.issuer.get('CN', '')}")
@@ -529,67 +533,6 @@ async def ssl_certification():
             print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")
 
 
-# Speed Comparison
-async def speed_comparison():
-    print("\n--- Speed Comparison ---")
-
-    # Firecrawl comparison
-    from firecrawl import FirecrawlApp
-
-    app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
-    start = time.time()
-    scrape_status = app.scrape_url(
-        "https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]}
-    )
-    end = time.time()
-    print("Firecrawl:")
-    print(f"Time taken: {end - start:.2f} seconds")
-    print(f"Content length: {len(scrape_status['markdown'])} characters")
-    print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}")
-    print()
-
-    # Crawl4AI comparisons
-    browser_config = BrowserConfig(headless=True)
-
-    # Simple crawl
-    async with AsyncWebCrawler(config=browser_config) as crawler:
-        start = time.time()
-        result = await crawler.arun(
-            url="https://www.nbcnews.com/business",
-            config=CrawlerRunConfig(
-                cache_mode=CacheMode.BYPASS, word_count_threshold=0
-            ),
-        )
-        end = time.time()
-        print("Crawl4AI (simple crawl):")
-        print(f"Time taken: {end - start:.2f} seconds")
-        print(f"Content length: {len(result.markdown)} characters")
-        print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
-        print()
-
-        # Advanced filtering
-        start = time.time()
-        result = await crawler.arun(
-            url="https://www.nbcnews.com/business",
-            config=CrawlerRunConfig(
-                cache_mode=CacheMode.BYPASS,
-                word_count_threshold=0,
-                markdown_generator=DefaultMarkdownGenerator(
-                    content_filter=PruningContentFilter(
-                        threshold=0.48, threshold_type="fixed", min_word_threshold=0
-                    )
-                ),
-            ),
-        )
-        end = time.time()
-        print("Crawl4AI (Markdown Plus):")
-        print(f"Time taken: {end - start:.2f} seconds")
-        print(f"Content length: {len(result.markdown.raw_markdown)} characters")
-        print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters")
-        print(f"Images found: {result.markdown.raw_markdown.count('cldnry.s-nbcnews.com')}")
-        print()
-
-
 # Main execution
 async def main():
     # Basic examples