Compare commits

..

2 Commits

Author SHA1 Message Date
UncleCode
e12d2e29e5 Update config.yml 2025-03-05 14:15:57 +08:00
UncleCode
fc425023f5 Update config.yml 2025-03-05 12:51:07 +08:00
5 changed files with 92 additions and 93 deletions

View File

@@ -1,2 +1,2 @@
# crawl4ai/_version.py
__version__ = "0.5.0.post1"
__version__ = "0.5.0"

View File

@@ -712,7 +712,7 @@ def profiles_cmd():
# Run interactive profile manager
anyio.run(manage_profiles)
@cli.command(name="")
@cli.command()
@click.argument("url", required=False)
@click.option("--example", is_flag=True, help="Show usage examples")
@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
@@ -772,11 +772,5 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
profile=profile
)
def main():
import sys
if len(sys.argv) < 2 or sys.argv[1] not in cli.commands:
sys.argv.insert(1, "crawl")
cli()
if __name__ == "__main__":
main()
cli()

View File

@@ -38,8 +38,8 @@ rate_limiting:
# Security Configuration
security:
enabled: true
jwt_enabled: true
enabled: false
jwt_enabled: false
https_redirect: false
trusted_hosts: ["*"]
headers:
@@ -68,4 +68,4 @@ observability:
enabled: True
endpoint: "/metrics"
health_check:
endpoint: "/health"
endpoint: "/health"

View File

@@ -65,6 +65,7 @@ async def basic_deep_crawl():
f"\n✅ Performance: {len(results)} pages in {time.perf_counter() - start_time:.2f} seconds"
)
# 2⃣ Stream vs. Non-Stream Execution
async def stream_vs_nonstream():
"""
@@ -126,6 +127,7 @@ async def stream_vs_nonstream():
print(f" ✅ All results: {time.perf_counter() - start_time:.2f} seconds")
print("\n🔍 Key Takeaway: Streaming allows processing results immediately")
# 3⃣ Introduce Filters & Scorers
async def filters_and_scorers():
"""
@@ -234,10 +236,82 @@ async def filters_and_scorers():
print(f" ✅ Crawler prioritized {len(results)} pages by relevance score")
print(" 🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first")
# 4⃣ Advanced Filters
# 4⃣ Wrap-Up and Key Takeaways
async def wrap_up():
"""
PART 4: Wrap-Up and Key Takeaways
Summarize the key concepts learned in this tutorial.
"""
print("\n===== COMPLETE CRAWLER EXAMPLE =====")
print("Combining filters, scorers, and streaming for an optimized crawl")
# Create a sophisticated filter chain
filter_chain = FilterChain(
[
DomainFilter(
allowed_domains=["docs.crawl4ai.com"],
blocked_domains=["old.docs.crawl4ai.com"],
),
URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
ContentTypeFilter(allowed_types=["text/html"]),
]
)
# Create a composite scorer that combines multiple scoring strategies
keyword_scorer = KeywordRelevanceScorer(
keywords=["crawl", "example", "async", "configuration"], weight=0.7
)
# Set up the configuration
config = CrawlerRunConfig(
deep_crawl_strategy=BestFirstCrawlingStrategy(
max_depth=1,
include_external=False,
filter_chain=filter_chain,
url_scorer=keyword_scorer,
),
scraping_strategy=LXMLWebScrapingStrategy(),
stream=True,
verbose=True,
)
# Execute the crawl
results = []
start_time = time.perf_counter()
async with AsyncWebCrawler() as crawler:
async for result in await crawler.arun(
url="https://docs.crawl4ai.com", config=config
):
results.append(result)
score = result.metadata.get("score", 0)
depth = result.metadata.get("depth", 0)
print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
duration = time.perf_counter() - start_time
# Summarize the results
print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
print(
f"✅ Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}"
)
# Group by depth
depth_counts = {}
for result in results:
depth = result.metadata.get("depth", 0)
depth_counts[depth] = depth_counts.get(depth, 0) + 1
print("\n📊 Pages crawled by depth:")
for depth, count in sorted(depth_counts.items()):
print(f" Depth {depth}: {count} pages")
# 5⃣ Advanced Filters
async def advanced_filters():
"""
PART 4: Demonstrates advanced filtering techniques for specialized crawling.
PART 5: Demonstrates advanced filtering techniques for specialized crawling.
This function covers:
- SEO filters
@@ -297,10 +371,11 @@ async def advanced_filters():
relevance_score = result.metadata.get("relevance_score", 0)
print(f" → Score: {relevance_score:.2f} | {result.url}")
# 5⃣ Max Pages and Score Thresholds
# Main function to run the entire tutorial
async def max_pages_and_thresholds():
"""
PART 5: Demonstrates using max_pages and score_threshold parameters with different strategies.
PART 6: Demonstrates using max_pages and score_threshold parameters with different strategies.
This function shows:
- How to limit the number of pages crawled
@@ -396,77 +471,6 @@ async def max_pages_and_thresholds():
print(f" ✅ Average score: {avg_score:.2f}")
print(" 🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first")
# 6⃣ Wrap-Up and Key Takeaways
async def wrap_up():
"""
PART 6: Wrap-Up and Key Takeaways
Summarize the key concepts learned in this tutorial.
"""
print("\n===== COMPLETE CRAWLER EXAMPLE =====")
print("Combining filters, scorers, and streaming for an optimized crawl")
# Create a sophisticated filter chain
filter_chain = FilterChain(
[
DomainFilter(
allowed_domains=["docs.crawl4ai.com"],
blocked_domains=["old.docs.crawl4ai.com"],
),
URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
ContentTypeFilter(allowed_types=["text/html"]),
]
)
# Create a composite scorer that combines multiple scoring strategies
keyword_scorer = KeywordRelevanceScorer(
keywords=["crawl", "example", "async", "configuration"], weight=0.7
)
# Set up the configuration
config = CrawlerRunConfig(
deep_crawl_strategy=BestFirstCrawlingStrategy(
max_depth=1,
include_external=False,
filter_chain=filter_chain,
url_scorer=keyword_scorer,
),
scraping_strategy=LXMLWebScrapingStrategy(),
stream=True,
verbose=True,
)
# Execute the crawl
results = []
start_time = time.perf_counter()
async with AsyncWebCrawler() as crawler:
async for result in await crawler.arun(
url="https://docs.crawl4ai.com", config=config
):
results.append(result)
score = result.metadata.get("score", 0)
depth = result.metadata.get("depth", 0)
print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
duration = time.perf_counter() - start_time
# Summarize the results
print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
print(
f"✅ Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}"
)
# Group by depth
depth_counts = {}
for result in results:
depth = result.metadata.get("depth", 0)
depth_counts[depth] = depth_counts.get(depth, 0) + 1
print("\n📊 Pages crawled by depth:")
for depth, count in sorted(depth_counts.items()):
print(f" Depth {depth}: {count} pages")
async def run_tutorial():
"""
Executes all tutorial sections in sequence.
@@ -478,12 +482,12 @@ async def run_tutorial():
# Define sections - uncomment to run specific parts during development
tutorial_sections = [
basic_deep_crawl,
stream_vs_nonstream,
filters_and_scorers,
max_pages_and_thresholds,
advanced_filters,
# basic_deep_crawl,
# stream_vs_nonstream,
# filters_and_scorers,
max_pages_and_thresholds, # Added new section
wrap_up,
advanced_filters,
]
for section in tutorial_sections:
@@ -493,6 +497,7 @@ async def run_tutorial():
print("You now have a comprehensive understanding of deep crawling with Crawl4AI.")
print("For more information, check out https://docs.crawl4ai.com")
# Execute the tutorial when run directly
if __name__ == "__main__":
asyncio.run(run_tutorial())

View File

@@ -78,7 +78,7 @@ crawl4ai-download-models = "crawl4ai.model_loader:main"
crawl4ai-migrate = "crawl4ai.migrations:main"
crawl4ai-setup = "crawl4ai.install:post_install"
crawl4ai-doctor = "crawl4ai.install:doctor"
crwl = "crawl4ai.cli:main"
crwl = "crawl4ai.cli:cli"
[tool.setuptools]
packages = {find = {where = ["."], include = ["crawl4ai*"]}}