Compare commits

...

4 Commits

Author SHA1 Message Date
UncleCode
9c58e4ce2e fix(docs): correct section numbering in deepcrawl_example.py tutorial 2025-03-04 20:57:33 +08:00
UncleCode
df6a6d5f4f refactor(docs): reorganize tutorial sections and update wrap-up example 2025-03-04 20:55:09 +08:00
UncleCode
e896c08f9c chore(version): bump version to 0.5.0.post1 2025-03-04 20:29:27 +08:00
UncleCode
56bc3c6e45 refactor(cli): improve CLI default command handling
Make 'crawl' the default command when no command is specified.
This improves user experience by allowing direct URL input without
explicitly specifying the 'crawl' command.

Also removes unnecessary blank lines in example code for better readability.
2025-03-04 20:28:16 +08:00
4 changed files with 90 additions and 89 deletions

View File

@@ -1,2 +1,2 @@
# crawl4ai/_version.py # crawl4ai/_version.py
__version__ = "0.5.0" __version__ = "0.5.0.post1"

View File

@@ -712,7 +712,7 @@ def profiles_cmd():
# Run interactive profile manager # Run interactive profile manager
anyio.run(manage_profiles) anyio.run(manage_profiles)
@cli.command() @cli.command(name="")
@click.argument("url", required=False) @click.argument("url", required=False)
@click.option("--example", is_flag=True, help="Show usage examples") @click.option("--example", is_flag=True, help="Show usage examples")
@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)") @click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
@@ -772,5 +772,11 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
profile=profile profile=profile
) )
def main():
    """CLI entry point that makes 'crawl' the default sub-command.

    If the first argument is missing or is not a registered sub-command
    (including option flags such as --example, which the crawl command
    understands), 'crawl' is spliced into argv so users can pass a URL
    or crawl options directly, e.g. `crwl https://example.com`.
    """
    import sys

    argv = sys.argv
    has_known_command = len(argv) >= 2 and argv[1] in cli.commands
    if not has_known_command:
        argv.insert(1, "crawl")
    cli()
if __name__ == "__main__": if __name__ == "__main__":
cli() main()

View File

@@ -65,7 +65,6 @@ async def basic_deep_crawl():
f"\n✅ Performance: {len(results)} pages in {time.perf_counter() - start_time:.2f} seconds" f"\n✅ Performance: {len(results)} pages in {time.perf_counter() - start_time:.2f} seconds"
) )
# 2⃣ Stream vs. Non-Stream Execution # 2⃣ Stream vs. Non-Stream Execution
async def stream_vs_nonstream(): async def stream_vs_nonstream():
""" """
@@ -127,7 +126,6 @@ async def stream_vs_nonstream():
print(f" ✅ All results: {time.perf_counter() - start_time:.2f} seconds") print(f" ✅ All results: {time.perf_counter() - start_time:.2f} seconds")
print("\n🔍 Key Takeaway: Streaming allows processing results immediately") print("\n🔍 Key Takeaway: Streaming allows processing results immediately")
# 3⃣ Introduce Filters & Scorers # 3⃣ Introduce Filters & Scorers
async def filters_and_scorers(): async def filters_and_scorers():
""" """
@@ -236,82 +234,10 @@ async def filters_and_scorers():
print(f" ✅ Crawler prioritized {len(results)} pages by relevance score") print(f" ✅ Crawler prioritized {len(results)} pages by relevance score")
print(" 🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first") print(" 🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first")
# 4⃣ Advanced Filters
# 4⃣ Wrap-Up and Key Takeaways
async def wrap_up():
    """
    PART 4: Wrap-Up and Key Takeaways
    Summarize the key concepts learned in this tutorial.
    """
    print("\n===== COMPLETE CRAWLER EXAMPLE =====")
    print("Combining filters, scorers, and streaming for an optimized crawl")

    # Create a sophisticated filter chain: restrict domain, match URL
    # patterns, and only accept HTML responses.
    filter_chain = FilterChain(
        [
            DomainFilter(
                allowed_domains=["docs.crawl4ai.com"],
                blocked_domains=["old.docs.crawl4ai.com"],
            ),
            URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
            ContentTypeFilter(allowed_types=["text/html"]),
        ]
    )

    # Scorer that prioritizes pages mentioning these keywords.
    keyword_scorer = KeywordRelevanceScorer(
        keywords=["crawl", "example", "async", "configuration"], weight=0.7
    )

    # Best-first strategy visits the highest-scoring pages first;
    # stream=True yields results as they arrive.
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=1,
            include_external=False,
            filter_chain=filter_chain,
            url_scorer=keyword_scorer,
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        stream=True,
        verbose=True,
    )

    # Execute the crawl and report each result as it streams in.
    results = []
    start_time = time.perf_counter()
    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=config
        ):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
    duration = time.perf_counter() - start_time

    # Summarize the results
    print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
    # Guard against ZeroDivisionError when the crawl yields no pages
    # (e.g. all URLs filtered out or the site is unreachable).
    if results:
        print(
            f"✅ Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}"
        )

    # Group by depth
    depth_counts = {}
    for result in results:
        depth = result.metadata.get("depth", 0)
        depth_counts[depth] = depth_counts.get(depth, 0) + 1
    print("\n📊 Pages crawled by depth:")
    for depth, count in sorted(depth_counts.items()):
        print(f"  Depth {depth}: {count} pages")
# 5⃣ Advanced Filters
async def advanced_filters(): async def advanced_filters():
""" """
PART 5: Demonstrates advanced filtering techniques for specialized crawling. PART 4: Demonstrates advanced filtering techniques for specialized crawling.
This function covers: This function covers:
- SEO filters - SEO filters
@@ -371,11 +297,10 @@ async def advanced_filters():
relevance_score = result.metadata.get("relevance_score", 0) relevance_score = result.metadata.get("relevance_score", 0)
print(f" → Score: {relevance_score:.2f} | {result.url}") print(f" → Score: {relevance_score:.2f} | {result.url}")
# 5⃣ Max Pages and Score Thresholds
# Main function to run the entire tutorial
async def max_pages_and_thresholds(): async def max_pages_and_thresholds():
""" """
PART 6: Demonstrates using max_pages and score_threshold parameters with different strategies. PART 5: Demonstrates using max_pages and score_threshold parameters with different strategies.
This function shows: This function shows:
- How to limit the number of pages crawled - How to limit the number of pages crawled
@@ -471,6 +396,77 @@ async def max_pages_and_thresholds():
print(f" ✅ Average score: {avg_score:.2f}") print(f" ✅ Average score: {avg_score:.2f}")
print(" 🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first") print(" 🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first")
# 6⃣ Wrap-Up and Key Takeaways
async def wrap_up():
    """
    PART 6: Wrap-Up and Key Takeaways
    Summarize the key concepts learned in this tutorial.
    """
    print("\n===== COMPLETE CRAWLER EXAMPLE =====")
    print("Combining filters, scorers, and streaming for an optimized crawl")

    # Create a sophisticated filter chain: restrict domain, match URL
    # patterns, and only accept HTML responses.
    filter_chain = FilterChain(
        [
            DomainFilter(
                allowed_domains=["docs.crawl4ai.com"],
                blocked_domains=["old.docs.crawl4ai.com"],
            ),
            URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
            ContentTypeFilter(allowed_types=["text/html"]),
        ]
    )

    # Scorer that prioritizes pages mentioning these keywords.
    keyword_scorer = KeywordRelevanceScorer(
        keywords=["crawl", "example", "async", "configuration"], weight=0.7
    )

    # Best-first strategy visits the highest-scoring pages first;
    # stream=True yields results as they arrive.
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=1,
            include_external=False,
            filter_chain=filter_chain,
            url_scorer=keyword_scorer,
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        stream=True,
        verbose=True,
    )

    # Execute the crawl and report each result as it streams in.
    results = []
    start_time = time.perf_counter()
    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=config
        ):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
    duration = time.perf_counter() - start_time

    # Summarize the results
    print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
    # Guard against ZeroDivisionError when the crawl yields no pages
    # (e.g. all URLs filtered out or the site is unreachable).
    if results:
        print(
            f"✅ Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}"
        )

    # Group by depth
    depth_counts = {}
    for result in results:
        depth = result.metadata.get("depth", 0)
        depth_counts[depth] = depth_counts.get(depth, 0) + 1
    print("\n📊 Pages crawled by depth:")
    for depth, count in sorted(depth_counts.items()):
        print(f"  Depth {depth}: {count} pages")
async def run_tutorial(): async def run_tutorial():
""" """
Executes all tutorial sections in sequence. Executes all tutorial sections in sequence.
@@ -482,12 +478,12 @@ async def run_tutorial():
# Define sections - uncomment to run specific parts during development # Define sections - uncomment to run specific parts during development
tutorial_sections = [ tutorial_sections = [
# basic_deep_crawl, basic_deep_crawl,
# stream_vs_nonstream, stream_vs_nonstream,
# filters_and_scorers, filters_and_scorers,
max_pages_and_thresholds, # Added new section max_pages_and_thresholds,
wrap_up,
advanced_filters, advanced_filters,
wrap_up,
] ]
for section in tutorial_sections: for section in tutorial_sections:
@@ -497,7 +493,6 @@ async def run_tutorial():
print("You now have a comprehensive understanding of deep crawling with Crawl4AI.") print("You now have a comprehensive understanding of deep crawling with Crawl4AI.")
print("For more information, check out https://docs.crawl4ai.com") print("For more information, check out https://docs.crawl4ai.com")
# Execute the tutorial when run directly # Execute the tutorial when run directly
if __name__ == "__main__": if __name__ == "__main__":
asyncio.run(run_tutorial()) asyncio.run(run_tutorial())

View File

@@ -78,7 +78,7 @@ crawl4ai-download-models = "crawl4ai.model_loader:main"
crawl4ai-migrate = "crawl4ai.migrations:main" crawl4ai-migrate = "crawl4ai.migrations:main"
crawl4ai-setup = "crawl4ai.install:post_install" crawl4ai-setup = "crawl4ai.install:post_install"
crawl4ai-doctor = "crawl4ai.install:doctor" crawl4ai-doctor = "crawl4ai.install:doctor"
crwl = "crawl4ai.cli:cli" crwl = "crawl4ai.cli:main"
[tool.setuptools] [tool.setuptools]
packages = {find = {where = ["."], include = ["crawl4ai*"]}} packages = {find = {where = ["."], include = ["crawl4ai*"]}}