refactor(core): replace float('inf') with math.inf
Replace float('inf') and float('-inf') with the math module's inf constant (imported as `infinity`, used as `infinity` / `-infinity`) for better readability and to avoid re-parsing the string 'inf' at each call site. Also clean up imports and remove unused speed comparison code.
No breaking changes.
This commit is contained in:
@@ -24,6 +24,8 @@ from urllib.parse import urlparse
|
|||||||
import random
|
import random
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
|
|
||||||
|
from math import inf as infinity
|
||||||
|
|
||||||
|
|
||||||
class RateLimiter:
|
class RateLimiter:
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -250,7 +252,7 @@ class CrawlerMonitor:
|
|||||||
key=lambda x: (
|
key=lambda x: (
|
||||||
x.status != CrawlStatus.IN_PROGRESS,
|
x.status != CrawlStatus.IN_PROGRESS,
|
||||||
x.status != CrawlStatus.QUEUED,
|
x.status != CrawlStatus.QUEUED,
|
||||||
x.end_time or float('inf'),
|
x.end_time or infinity,
|
||||||
),
|
),
|
||||||
)[: self.max_visible_rows]
|
)[: self.max_visible_rows]
|
||||||
|
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ from . import DeepCrawlStrategy
|
|||||||
|
|
||||||
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn
|
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn
|
||||||
|
|
||||||
|
from math import inf as infinity
|
||||||
|
|
||||||
# Configurable batch size for processing items from the priority queue
|
# Configurable batch size for processing items from the priority queue
|
||||||
BATCH_SIZE = 10
|
BATCH_SIZE = 10
|
||||||
@@ -37,7 +38,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
|||||||
filter_chain: FilterChain = FilterChain(),
|
filter_chain: FilterChain = FilterChain(),
|
||||||
url_scorer: Optional[URLScorer] = None,
|
url_scorer: Optional[URLScorer] = None,
|
||||||
include_external: bool = False,
|
include_external: bool = False,
|
||||||
max_pages: int = float('inf'),
|
max_pages: int = infinity,
|
||||||
logger: Optional[logging.Logger] = None,
|
logger: Optional[logging.Logger] = None,
|
||||||
):
|
):
|
||||||
self.max_depth = max_depth
|
self.max_depth = max_depth
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ from .filters import FilterChain
|
|||||||
from .scorers import URLScorer
|
from .scorers import URLScorer
|
||||||
from . import DeepCrawlStrategy
|
from . import DeepCrawlStrategy
|
||||||
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
|
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
|
||||||
|
from math import inf as infinity
|
||||||
|
|
||||||
class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
||||||
"""
|
"""
|
||||||
@@ -26,8 +27,8 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
|||||||
filter_chain: FilterChain = FilterChain(),
|
filter_chain: FilterChain = FilterChain(),
|
||||||
url_scorer: Optional[URLScorer] = None,
|
url_scorer: Optional[URLScorer] = None,
|
||||||
include_external: bool = False,
|
include_external: bool = False,
|
||||||
score_threshold: float = float('-inf'),
|
score_threshold: float = -infinity,
|
||||||
max_pages: int = float('inf'),
|
max_pages: int = infinity,
|
||||||
logger: Optional[logging.Logger] = None,
|
logger: Optional[logging.Logger] = None,
|
||||||
):
|
):
|
||||||
self.max_depth = max_depth
|
self.max_depth = max_depth
|
||||||
|
|||||||
@@ -1,10 +1,11 @@
|
|||||||
from crawl4ai.async_configs import LlmConfig
|
from crawl4ai.async_configs import LlmConfig
|
||||||
from crawl4ai.extraction_strategy import *
|
from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy
|
||||||
from crawl4ai.crawler_strategy import *
|
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import os
|
||||||
|
import json
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
url = r"https://openai.com/api/pricing/"
|
url = "https://openai.com/api/pricing/"
|
||||||
|
|
||||||
|
|
||||||
class OpenAIModelFee(BaseModel):
|
class OpenAIModelFee(BaseModel):
|
||||||
@@ -14,10 +15,6 @@ class OpenAIModelFee(BaseModel):
|
|||||||
..., description="Fee for output token for the OpenAI model."
|
..., description="Fee for output token for the OpenAI model."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
from crawl4ai import AsyncWebCrawler
|
|
||||||
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
# Use AsyncWebCrawler
|
# Use AsyncWebCrawler
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
|||||||
@@ -416,6 +416,7 @@ async def crawl_dynamic_content_pages_method_2():
|
|||||||
|
|
||||||
|
|
||||||
async def cosine_similarity_extraction():
|
async def cosine_similarity_extraction():
|
||||||
|
from crawl4ai.extraction_strategy import CosineStrategy
|
||||||
crawl_config = CrawlerRunConfig(
|
crawl_config = CrawlerRunConfig(
|
||||||
cache_mode=CacheMode.BYPASS,
|
cache_mode=CacheMode.BYPASS,
|
||||||
extraction_strategy=CosineStrategy(
|
extraction_strategy=CosineStrategy(
|
||||||
@@ -507,6 +508,9 @@ async def ssl_certification():
|
|||||||
if result.success and result.ssl_certificate:
|
if result.success and result.ssl_certificate:
|
||||||
cert = result.ssl_certificate
|
cert = result.ssl_certificate
|
||||||
|
|
||||||
|
tmp_dir = os.path.join(__location__, "tmp")
|
||||||
|
os.makedirs(tmp_dir, exist_ok=True)
|
||||||
|
|
||||||
# 1. Access certificate properties directly
|
# 1. Access certificate properties directly
|
||||||
print("\nCertificate Information:")
|
print("\nCertificate Information:")
|
||||||
print(f"Issuer: {cert.issuer.get('CN', '')}")
|
print(f"Issuer: {cert.issuer.get('CN', '')}")
|
||||||
@@ -529,67 +533,6 @@ async def ssl_certification():
|
|||||||
print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")
|
print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")
|
||||||
|
|
||||||
|
|
||||||
# Speed Comparison
|
|
||||||
async def speed_comparison():
|
|
||||||
print("\n--- Speed Comparison ---")
|
|
||||||
|
|
||||||
# Firecrawl comparison
|
|
||||||
from firecrawl import FirecrawlApp
|
|
||||||
|
|
||||||
app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
|
|
||||||
start = time.time()
|
|
||||||
scrape_status = app.scrape_url(
|
|
||||||
"https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]}
|
|
||||||
)
|
|
||||||
end = time.time()
|
|
||||||
print("Firecrawl:")
|
|
||||||
print(f"Time taken: {end - start:.2f} seconds")
|
|
||||||
print(f"Content length: {len(scrape_status['markdown'])} characters")
|
|
||||||
print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}")
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Crawl4AI comparisons
|
|
||||||
browser_config = BrowserConfig(headless=True)
|
|
||||||
|
|
||||||
# Simple crawl
|
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
|
||||||
start = time.time()
|
|
||||||
result = await crawler.arun(
|
|
||||||
url="https://www.nbcnews.com/business",
|
|
||||||
config=CrawlerRunConfig(
|
|
||||||
cache_mode=CacheMode.BYPASS, word_count_threshold=0
|
|
||||||
),
|
|
||||||
)
|
|
||||||
end = time.time()
|
|
||||||
print("Crawl4AI (simple crawl):")
|
|
||||||
print(f"Time taken: {end - start:.2f} seconds")
|
|
||||||
print(f"Content length: {len(result.markdown)} characters")
|
|
||||||
print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Advanced filtering
|
|
||||||
start = time.time()
|
|
||||||
result = await crawler.arun(
|
|
||||||
url="https://www.nbcnews.com/business",
|
|
||||||
config=CrawlerRunConfig(
|
|
||||||
cache_mode=CacheMode.BYPASS,
|
|
||||||
word_count_threshold=0,
|
|
||||||
markdown_generator=DefaultMarkdownGenerator(
|
|
||||||
content_filter=PruningContentFilter(
|
|
||||||
threshold=0.48, threshold_type="fixed", min_word_threshold=0
|
|
||||||
)
|
|
||||||
),
|
|
||||||
),
|
|
||||||
)
|
|
||||||
end = time.time()
|
|
||||||
print("Crawl4AI (Markdown Plus):")
|
|
||||||
print(f"Time taken: {end - start:.2f} seconds")
|
|
||||||
print(f"Content length: {len(result.markdown.raw_markdown)} characters")
|
|
||||||
print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters")
|
|
||||||
print(f"Images found: {result.markdown.raw_markdown.count('cldnry.s-nbcnews.com')}")
|
|
||||||
print()
|
|
||||||
|
|
||||||
|
|
||||||
# Main execution
|
# Main execution
|
||||||
async def main():
|
async def main():
|
||||||
# Basic examples
|
# Basic examples
|
||||||
|
|||||||
Reference in New Issue
Block a user