refactor(core): replace float('inf') with math.inf
Replace float('inf') and float('-inf') with math.inf and -math.inf from the math module for better readability and performance. Also clean up imports and remove unused speed comparison code.
No breaking changes.
This commit is contained in:
@@ -1,10 +1,11 @@
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai.extraction_strategy import *
|
||||
from crawl4ai.crawler_strategy import *
|
||||
from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy
|
||||
import asyncio
|
||||
import os
|
||||
import json
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
url = r"https://openai.com/api/pricing/"
|
||||
url = "https://openai.com/api/pricing/"
|
||||
|
||||
|
||||
class OpenAIModelFee(BaseModel):
|
||||
@@ -14,10 +15,6 @@ class OpenAIModelFee(BaseModel):
|
||||
..., description="Fee for output token for the OpenAI model."
|
||||
)
|
||||
|
||||
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
|
||||
|
||||
async def main():
|
||||
# Use AsyncWebCrawler
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
|
||||
@@ -416,6 +416,7 @@ async def crawl_dynamic_content_pages_method_2():
|
||||
|
||||
|
||||
async def cosine_similarity_extraction():
|
||||
from crawl4ai.extraction_strategy import CosineStrategy
|
||||
crawl_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
extraction_strategy=CosineStrategy(
|
||||
@@ -507,6 +508,9 @@ async def ssl_certification():
|
||||
if result.success and result.ssl_certificate:
|
||||
cert = result.ssl_certificate
|
||||
|
||||
tmp_dir = os.path.join(__location__, "tmp")
|
||||
os.makedirs(tmp_dir, exist_ok=True)
|
||||
|
||||
# 1. Access certificate properties directly
|
||||
print("\nCertificate Information:")
|
||||
print(f"Issuer: {cert.issuer.get('CN', '')}")
|
||||
@@ -529,67 +533,6 @@ async def ssl_certification():
|
||||
print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")
|
||||
|
||||
|
||||
# Speed Comparison
|
||||
async def speed_comparison():
    """Print a timing comparison of Firecrawl and Crawl4AI on one page.

    Three scrapes of the same URL are timed and reported on stdout:

    1. Firecrawl's ``scrape_url`` requesting markdown and html formats.
    2. Crawl4AI with cache bypassed and ``word_count_threshold=0``.
    3. Crawl4AI with the same settings plus a ``DefaultMarkdownGenerator``
       using a ``PruningContentFilter``.

    For each run the elapsed time, the markdown length and the number of
    occurrences of the image host string in the markdown are printed.

    Raises:
        KeyError: if the ``FIRECRAWL_API_KEY`` environment variable is unset.
        ImportError: if the ``firecrawl`` package is not installed.
    """
    target_url = "https://www.nbcnews.com/business"
    image_host = "cldnry.s-nbcnews.com"

    print("\n--- Speed Comparison ---")

    # Firecrawl comparison
    # Imported here so the dependency is only needed when this comparison runs.
    from firecrawl import FirecrawlApp

    app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    start = time.perf_counter()
    scrape_status = app.scrape_url(
        target_url, params={"formats": ["markdown", "html"]}
    )
    end = time.perf_counter()
    print("Firecrawl:")
    print(f"Time taken: {end - start:.2f} seconds")
    print(f"Content length: {len(scrape_status['markdown'])} characters")
    print(f"Images found: {scrape_status['markdown'].count(image_host)}")
    print()

    # Crawl4AI comparisons
    browser_config = BrowserConfig(headless=True)

    # Both Crawl4AI runs share one crawler instance.
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Simple crawl
        start = time.perf_counter()
        result = await crawler.arun(
            url=target_url,
            config=CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS, word_count_threshold=0
            ),
        )
        end = time.perf_counter()
        print("Crawl4AI (simple crawl):")
        print(f"Time taken: {end - start:.2f} seconds")
        print(f"Content length: {len(result.markdown)} characters")
        print(f"Images found: {result.markdown.count(image_host)}")
        print()

        # Advanced filtering
        start = time.perf_counter()
        result = await crawler.arun(
            url=target_url,
            config=CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                word_count_threshold=0,
                markdown_generator=DefaultMarkdownGenerator(
                    content_filter=PruningContentFilter(
                        threshold=0.48, threshold_type="fixed", min_word_threshold=0
                    )
                ),
            ),
        )
        end = time.perf_counter()
        print("Crawl4AI (Markdown Plus):")
        print(f"Time taken: {end - start:.2f} seconds")
        print(f"Content length: {len(result.markdown.raw_markdown)} characters")
        print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters")
        print(f"Images found: {result.markdown.raw_markdown.count(image_host)}")
        print()
|
||||
|
||||
|
||||
# Main execution
|
||||
async def main():
|
||||
# Basic examples
|
||||
|
||||
Reference in New Issue
Block a user