From 415c1c5bee6da143b36ae0f348f1fb3eb46c0de6 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 4 Mar 2025 18:23:55 +0800 Subject: [PATCH] refactor(core): replace float('inf') with math.inf Replace float('inf') and float('-inf') with math.inf and -math.inf from the math module for better readability and performance. Also clean up imports and remove unused speed comparison code. No breaking changes. --- crawl4ai/async_dispatcher.py | 4 +- crawl4ai/deep_crawling/bff_strategy.py | 3 +- crawl4ai/deep_crawling/bfs_strategy.py | 5 +- .../examples/llm_extraction_openai_pricing.py | 11 ++-- docs/examples/quickstart_async.config.py | 65 ++----------------- 5 files changed, 16 insertions(+), 72 deletions(-) diff --git a/crawl4ai/async_dispatcher.py b/crawl4ai/async_dispatcher.py index 56c4d567..69d276fb 100644 --- a/crawl4ai/async_dispatcher.py +++ b/crawl4ai/async_dispatcher.py @@ -24,6 +24,8 @@ from urllib.parse import urlparse import random from abc import ABC, abstractmethod +from math import inf as infinity + class RateLimiter: def __init__( @@ -250,7 +252,7 @@ class CrawlerMonitor: key=lambda x: ( x.status != CrawlStatus.IN_PROGRESS, x.status != CrawlStatus.QUEUED, - x.end_time or float('inf'), + x.end_time or infinity, ), )[: self.max_visible_rows] diff --git a/crawl4ai/deep_crawling/bff_strategy.py b/crawl4ai/deep_crawling/bff_strategy.py index fc2047d2..4811ba14 100644 --- a/crawl4ai/deep_crawling/bff_strategy.py +++ b/crawl4ai/deep_crawling/bff_strategy.py @@ -12,6 +12,7 @@ from . import DeepCrawlStrategy from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn +from math import inf as infinity # Configurable batch size for processing items from the priority queue BATCH_SIZE = 10 @@ -37,7 +38,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy): filter_chain: FilterChain = FilterChain(), url_scorer: Optional[URLScorer] = None, include_external: bool = False, - max_pages: int = float('inf'), + max_pages: int = infinity, logger: Optional[logging.Logger] = None, ): self.max_depth = max_depth diff --git a/crawl4ai/deep_crawling/bfs_strategy.py b/crawl4ai/deep_crawling/bfs_strategy.py index dcdee30c..1ae4c4b9 100644 --- a/crawl4ai/deep_crawling/bfs_strategy.py +++ b/crawl4ai/deep_crawling/bfs_strategy.py @@ -10,6 +10,7 @@ from .filters import FilterChain from .scorers import URLScorer from . import DeepCrawlStrategy from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult +from math import inf as infinity class BFSDeepCrawlStrategy(DeepCrawlStrategy): """ @@ -26,8 +27,8 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): filter_chain: FilterChain = FilterChain(), url_scorer: Optional[URLScorer] = None, include_external: bool = False, - score_threshold: float = float('-inf'), - max_pages: int = float('inf'), + score_threshold: float = -infinity, + max_pages: int = infinity, logger: Optional[logging.Logger] = None, ): self.max_depth = max_depth diff --git a/docs/examples/llm_extraction_openai_pricing.py b/docs/examples/llm_extraction_openai_pricing.py index 72742bd5..d77fe337 100644 --- a/docs/examples/llm_extraction_openai_pricing.py +++ b/docs/examples/llm_extraction_openai_pricing.py @@ -1,10 +1,11 @@ from crawl4ai.async_configs import LlmConfig -from crawl4ai.extraction_strategy import * -from crawl4ai.crawler_strategy import * +from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy import asyncio +import os +import json from pydantic import BaseModel, Field -url = r"https://openai.com/api/pricing/" +url = "https://openai.com/api/pricing/" class OpenAIModelFee(BaseModel): @@ -14,10 +15,6 @@ class OpenAIModelFee(BaseModel): ..., description="Fee for output token for the OpenAI model." ) - -from crawl4ai import AsyncWebCrawler - - async def main(): # Use AsyncWebCrawler async with AsyncWebCrawler() as crawler: diff --git a/docs/examples/quickstart_async.config.py b/docs/examples/quickstart_async.config.py index 450880a9..184158c3 100644 --- a/docs/examples/quickstart_async.config.py +++ b/docs/examples/quickstart_async.config.py @@ -416,6 +416,7 @@ async def crawl_dynamic_content_pages_method_2(): async def cosine_similarity_extraction(): + from crawl4ai.extraction_strategy import CosineStrategy crawl_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, extraction_strategy=CosineStrategy( @@ -507,6 +508,9 @@ async def ssl_certification(): if result.success and result.ssl_certificate: cert = result.ssl_certificate + tmp_dir = os.path.join(__location__, "tmp") + os.makedirs(tmp_dir, exist_ok=True) + # 1. Access certificate properties directly print("\nCertificate Information:") print(f"Issuer: {cert.issuer.get('CN', '')}") @@ -529,67 +533,6 @@ async def ssl_certification(): print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}") -# Speed Comparison -async def speed_comparison(): - print("\n--- Speed Comparison ---") - - # Firecrawl comparison - from firecrawl import FirecrawlApp - - app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"]) - start = time.time() - scrape_status = app.scrape_url( - "https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]} - ) - end = time.time() - print("Firecrawl:") - print(f"Time taken: {end - start:.2f} seconds") - print(f"Content length: {len(scrape_status['markdown'])} characters") - print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}") - print() - - # Crawl4AI comparisons - browser_config = BrowserConfig(headless=True) - - # Simple crawl - async with AsyncWebCrawler(config=browser_config) as crawler: - start = time.time() - result = await crawler.arun( - url="https://www.nbcnews.com/business", - config=CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, word_count_threshold=0 - ), - ) - end = time.time() - print("Crawl4AI (simple crawl):") - print(f"Time taken: {end - start:.2f} seconds") - print(f"Content length: {len(result.markdown)} characters") - print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}") - print() - - # Advanced filtering - start = time.time() - result = await crawler.arun( - url="https://www.nbcnews.com/business", - config=CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - word_count_threshold=0, - markdown_generator=DefaultMarkdownGenerator( - content_filter=PruningContentFilter( - threshold=0.48, threshold_type="fixed", min_word_threshold=0 - ) - ), - ), - ) - end = time.time() - print("Crawl4AI (Markdown Plus):") - print(f"Time taken: {end - start:.2f} seconds") - print(f"Content length: {len(result.markdown.raw_markdown)} characters") - print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters") - print(f"Images found: {result.markdown.raw_markdown.count('cldnry.s-nbcnews.com')}") - print() - - # Main execution async def main(): # Basic examples