refactor(core): replace float('inf') with math.inf

Replace float('inf') and float('-inf') with math.inf and -math.inf from the math module: the named constants read better and avoid re-parsing a string literal on every call. Also clean up imports in the examples and remove the unused speed-comparison code.

No breaking changes.
UncleCode
2025-03-04 18:23:55 +08:00
parent f334daa979
commit 415c1c5bee
5 changed files with 16 additions and 72 deletions
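
For context, math.inf compares equal to float('inf'); the switch is mostly stylistic, plus a small win because float('inf') parses a string on every call while math.inf is a plain name lookup. A minimal standalone sketch of the equivalence:

    import math

    # The constant and the parsed literal are the same float value.
    assert math.inf == float('inf')
    assert -math.inf == float('-inf')
    # inf exceeds every finite float, which is what the sentinels below rely on.
    print(math.inf > 1e308)  # True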

View File

@@ -24,6 +24,8 @@ from urllib.parse import urlparse
 import random
 from abc import ABC, abstractmethod
+from math import inf as infinity
+
 class RateLimiter:
     def __init__(

@@ -250,7 +252,7 @@ class CrawlerMonitor:
             key=lambda x: (
                 x.status != CrawlStatus.IN_PROGRESS,
                 x.status != CrawlStatus.QUEUED,
-                x.end_time or float('inf'),
+                x.end_time or infinity,
             ),
         )[: self.max_visible_rows]
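
The infinity sentinel in the sort key maps a missing end_time to +inf, so still-running tasks sort after finished ones. An illustrative sketch with made-up rows, not the monitor's actual types:

    from math import inf as infinity

    # Hypothetical (name, end_time) rows; None means still running.
    rows = [("a", 12.5), ("b", None), ("c", 3.1)]
    rows.sort(key=lambda r: r[1] or infinity)
    print(rows)  # [('c', 3.1), ('a', 12.5), ('b', None)]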

View File

@@ -12,6 +12,7 @@ from . import DeepCrawlStrategy
 from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn
+from math import inf as infinity

 # Configurable batch size for processing items from the priority queue
 BATCH_SIZE = 10

@@ -37,7 +38,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
         filter_chain: FilterChain = FilterChain(),
         url_scorer: Optional[URLScorer] = None,
         include_external: bool = False,
-        max_pages: int = float('inf'),
+        max_pages: int = infinity,
         logger: Optional[logging.Logger] = None,
     ):
         self.max_depth = max_depth
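
Here math.inf serves as an "unbounded" default: any finite page count compares below it, so the budget check never trips unless a caller sets a real limit. A minimal sketch with hypothetical names, not the strategy's actual logic:

    from math import inf as infinity

    def budget_exhausted(pages_crawled: int, max_pages: float = infinity) -> bool:
        # Every finite count is < inf, so the default never stops the crawl.
        return pages_crawled >= max_pages

    print(budget_exhausted(10_000))            # False: unbounded default
    print(budget_exhausted(11, max_pages=10))  # True: budget spent

Note that math.inf is a float, so the max_pages: int annotation is nominal rather than enforced.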

View File

@@ -10,6 +10,7 @@ from .filters import FilterChain
 from .scorers import URLScorer
 from . import DeepCrawlStrategy
 from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
+from math import inf as infinity

 class BFSDeepCrawlStrategy(DeepCrawlStrategy):
     """

@@ -26,8 +27,8 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
         filter_chain: FilterChain = FilterChain(),
         url_scorer: Optional[URLScorer] = None,
         include_external: bool = False,
-        score_threshold: float = float('-inf'),
-        max_pages: int = float('inf'),
+        score_threshold: float = -infinity,
+        max_pages: int = infinity,
         logger: Optional[logging.Logger] = None,
     ):
         self.max_depth = max_depth
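
-infinity plays the complementary role for score_threshold: every real score satisfies score >= -inf, so the default filters nothing. Again a sketch with hypothetical names:

    from math import inf as infinity

    def passes(score: float, score_threshold: float = -infinity) -> bool:
        # Any finite score clears a -inf threshold, so the default keeps all URLs.
        return score >= score_threshold

    print(passes(-42.0))                     # True under the default
    print(passes(0.2, score_threshold=0.5))  # False: scored below the cutoff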

View File

@@ -1,10 +1,11 @@
 from crawl4ai.async_configs import LlmConfig
-from crawl4ai.extraction_strategy import *
-from crawl4ai.crawler_strategy import *
+from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy
 import asyncio
+import os
+import json
 from pydantic import BaseModel, Field

-url = r"https://openai.com/api/pricing/"
+url = "https://openai.com/api/pricing/"

 class OpenAIModelFee(BaseModel):

@@ -14,10 +15,6 @@ class OpenAIModelFee(BaseModel):
         ..., description="Fee for output token for the OpenAI model."
     )

-from crawl4ai import AsyncWebCrawler
-
 async def main():
     # Use AsyncWebCrawler
     async with AsyncWebCrawler() as crawler:

View File

@@ -416,6 +416,7 @@ async def crawl_dynamic_content_pages_method_2():

 async def cosine_similarity_extraction():
+    from crawl4ai.extraction_strategy import CosineStrategy
     crawl_config = CrawlerRunConfig(
         cache_mode=CacheMode.BYPASS,
         extraction_strategy=CosineStrategy(
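
Moving the CosineStrategy import inside the function is the lazy-import pattern: the dependency is only resolved if the example actually runs. The same idea with a stdlib module, purely illustrative:

    def render_json(data) -> str:
        import json  # resolved on first call, not at module import time
        return json.dumps(data, indent=2)

    print(render_json({"ok": True}))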
@@ -507,6 +508,9 @@ async def ssl_certification():
         if result.success and result.ssl_certificate:
             cert = result.ssl_certificate
+            tmp_dir = os.path.join(__location__, "tmp")
+            os.makedirs(tmp_dir, exist_ok=True)
+
             # 1. Access certificate properties directly
             print("\nCertificate Information:")
             print(f"Issuer: {cert.issuer.get('CN', '')}")
@@ -529,67 +533,6 @@ async def ssl_certification():
         print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")

-# Speed Comparison
-async def speed_comparison():
-    print("\n--- Speed Comparison ---")
-
-    # Firecrawl comparison
-    from firecrawl import FirecrawlApp
-
-    app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
-    start = time.time()
-    scrape_status = app.scrape_url(
-        "https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]}
-    )
-    end = time.time()
-    print("Firecrawl:")
-    print(f"Time taken: {end - start:.2f} seconds")
-    print(f"Content length: {len(scrape_status['markdown'])} characters")
-    print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}")
-    print()
-
-    # Crawl4AI comparisons
-    browser_config = BrowserConfig(headless=True)
-
-    # Simple crawl
-    async with AsyncWebCrawler(config=browser_config) as crawler:
-        start = time.time()
-        result = await crawler.arun(
-            url="https://www.nbcnews.com/business",
-            config=CrawlerRunConfig(
-                cache_mode=CacheMode.BYPASS, word_count_threshold=0
-            ),
-        )
-        end = time.time()
-        print("Crawl4AI (simple crawl):")
-        print(f"Time taken: {end - start:.2f} seconds")
-        print(f"Content length: {len(result.markdown)} characters")
-        print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
-        print()
-
-        # Advanced filtering
-        start = time.time()
-        result = await crawler.arun(
-            url="https://www.nbcnews.com/business",
-            config=CrawlerRunConfig(
-                cache_mode=CacheMode.BYPASS,
-                word_count_threshold=0,
-                markdown_generator=DefaultMarkdownGenerator(
-                    content_filter=PruningContentFilter(
-                        threshold=0.48, threshold_type="fixed", min_word_threshold=0
-                    )
-                ),
-            ),
-        )
-        end = time.time()
-        print("Crawl4AI (Markdown Plus):")
-        print(f"Time taken: {end - start:.2f} seconds")
-        print(f"Content length: {len(result.markdown.raw_markdown)} characters")
-        print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters")
-        print(f"Images found: {result.markdown.raw_markdown.count('cldnry.s-nbcnews.com')}")
-        print()
-

 # Main execution
 async def main():
     # Basic examples