Compare commits

...

4 Commits

Author SHA1 Message Date
UncleCode
9c58e4ce2e fix(docs): correct section numbering in deepcrawl_example.py tutorial 2025-03-04 20:57:33 +08:00
UncleCode
df6a6d5f4f refactor(docs): reorganize tutorial sections and update wrap-up example 2025-03-04 20:55:09 +08:00
UncleCode
e896c08f9c chore(version): bump version to 0.5.0.post1 2025-03-04 20:29:27 +08:00
UncleCode
56bc3c6e45 refactor(cli): improve CLI default command handling
Make 'crawl' the default command when no command is specified.
This improves user experience by allowing direct URL input without
explicitly specifying the 'crawl' command.

Also removes unnecessary blank lines in example code for better readability.
2025-03-04 20:28:16 +08:00
4 changed files with 90 additions and 89 deletions

View File

@@ -1,2 +1,2 @@
# crawl4ai/_version.py # crawl4ai/_version.py
__version__ = "0.5.0" __version__ = "0.5.0.post1"

View File

@@ -712,7 +712,7 @@ def profiles_cmd():
# Run interactive profile manager # Run interactive profile manager
anyio.run(manage_profiles) anyio.run(manage_profiles)
@cli.command() @cli.command(name="")
@click.argument("url", required=False) @click.argument("url", required=False)
@click.option("--example", is_flag=True, help="Show usage examples") @click.option("--example", is_flag=True, help="Show usage examples")
@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)") @click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
@@ -772,5 +772,11 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
profile=profile profile=profile
) )
def main():
    """CLI entry point that makes 'crawl' the default sub-command.

    If the first argument is missing or is not a registered sub-command
    (including option flags such as --example, which the crawl command
    understands), 'crawl' is spliced into argv so users can pass a URL
    or crawl options directly, e.g. `crwl https://example.com`.
    """
    import sys

    argv = sys.argv
    has_known_command = len(argv) >= 2 and argv[1] in cli.commands
    if not has_known_command:
        argv.insert(1, "crawl")
    cli()
if __name__ == "__main__": if __name__ == "__main__":
cli() main()

View File

@@ -65,7 +65,6 @@ async def basic_deep_crawl():
f"\n✅ Performance: {len(results)} pages in {time.perf_counter() - start_time:.2f} seconds" f"\n✅ Performance: {len(results)} pages in {time.perf_counter() - start_time:.2f} seconds"
) )
# 2⃣ Stream vs. Non-Stream Execution # 2⃣ Stream vs. Non-Stream Execution
async def stream_vs_nonstream(): async def stream_vs_nonstream():
""" """
@@ -127,7 +126,6 @@ async def stream_vs_nonstream():
print(f" ✅ All results: {time.perf_counter() - start_time:.2f} seconds") print(f" ✅ All results: {time.perf_counter() - start_time:.2f} seconds")
print("\n🔍 Key Takeaway: Streaming allows processing results immediately") print("\n🔍 Key Takeaway: Streaming allows processing results immediately")
# 3⃣ Introduce Filters & Scorers # 3⃣ Introduce Filters & Scorers
async def filters_and_scorers(): async def filters_and_scorers():
""" """
@@ -236,82 +234,10 @@ async def filters_and_scorers():
print(f" ✅ Crawler prioritized {len(results)} pages by relevance score") print(f" ✅ Crawler prioritized {len(results)} pages by relevance score")
print(" 🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first") print(" 🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first")
# 4⃣ Advanced Filters
# 4⃣ Wrap-Up and Key Takeaways
async def wrap_up():
    """
    PART 4: Wrap-Up and Key Takeaways
    Summarize the key concepts learned in this tutorial.
    """
    print("\n===== COMPLETE CRAWLER EXAMPLE =====")
    print("Combining filters, scorers, and streaming for an optimized crawl")

    # Create a sophisticated filter chain: restrict domain, match URL
    # patterns, and only accept HTML responses.
    filter_chain = FilterChain(
        [
            DomainFilter(
                allowed_domains=["docs.crawl4ai.com"],
                blocked_domains=["old.docs.crawl4ai.com"],
            ),
            URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
            ContentTypeFilter(allowed_types=["text/html"]),
        ]
    )

    # Scorer that prioritizes pages mentioning these keywords.
    keyword_scorer = KeywordRelevanceScorer(
        keywords=["crawl", "example", "async", "configuration"], weight=0.7
    )

    # Best-first strategy visits the highest-scoring pages first;
    # stream=True yields results as they arrive.
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=1,
            include_external=False,
            filter_chain=filter_chain,
            url_scorer=keyword_scorer,
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        stream=True,
        verbose=True,
    )

    # Execute the crawl and report each result as it streams in.
    results = []
    start_time = time.perf_counter()
    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=config
        ):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
    duration = time.perf_counter() - start_time

    # Summarize the results
    print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
    # Guard against ZeroDivisionError when the crawl yields no pages
    # (e.g. all URLs filtered out or the site is unreachable).
    if results:
        print(
            f"✅ Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}"
        )

    # Group by depth
    depth_counts = {}
    for result in results:
        depth = result.metadata.get("depth", 0)
        depth_counts[depth] = depth_counts.get(depth, 0) + 1
    print("\n📊 Pages crawled by depth:")
    for depth, count in sorted(depth_counts.items()):
        print(f"  Depth {depth}: {count} pages")
# 5⃣ Advanced Filters
async def advanced_filters(): async def advanced_filters():
""" """
PART 5: Demonstrates advanced filtering techniques for specialized crawling. PART 4: Demonstrates advanced filtering techniques for specialized crawling.
This function covers: This function covers:
- SEO filters - SEO filters
@@ -371,11 +297,10 @@ async def advanced_filters():
relevance_score = result.metadata.get("relevance_score", 0) relevance_score = result.metadata.get("relevance_score", 0)
print(f" → Score: {relevance_score:.2f} | {result.url}") print(f" → Score: {relevance_score:.2f} | {result.url}")
# 5⃣ Max Pages and Score Thresholds
# Main function to run the entire tutorial
async def max_pages_and_thresholds(): async def max_pages_and_thresholds():
""" """
PART 6: Demonstrates using max_pages and score_threshold parameters with different strategies. PART 5: Demonstrates using max_pages and score_threshold parameters with different strategies.
This function shows: This function shows:
- How to limit the number of pages crawled - How to limit the number of pages crawled
@@ -471,6 +396,77 @@ async def max_pages_and_thresholds():
print(f" ✅ Average score: {avg_score:.2f}") print(f" ✅ Average score: {avg_score:.2f}")
print(" 🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first") print(" 🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first")
# 6⃣ Wrap-Up and Key Takeaways
async def wrap_up():
    """
    PART 6: Wrap-Up and Key Takeaways
    Summarize the key concepts learned in this tutorial.
    """
    print("\n===== COMPLETE CRAWLER EXAMPLE =====")
    print("Combining filters, scorers, and streaming for an optimized crawl")

    # Create a sophisticated filter chain: restrict domain, match URL
    # patterns, and only accept HTML responses.
    filter_chain = FilterChain(
        [
            DomainFilter(
                allowed_domains=["docs.crawl4ai.com"],
                blocked_domains=["old.docs.crawl4ai.com"],
            ),
            URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
            ContentTypeFilter(allowed_types=["text/html"]),
        ]
    )

    # Scorer that prioritizes pages mentioning these keywords.
    keyword_scorer = KeywordRelevanceScorer(
        keywords=["crawl", "example", "async", "configuration"], weight=0.7
    )

    # Best-first strategy visits the highest-scoring pages first;
    # stream=True yields results as they arrive.
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=1,
            include_external=False,
            filter_chain=filter_chain,
            url_scorer=keyword_scorer,
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        stream=True,
        verbose=True,
    )

    # Execute the crawl and report each result as it streams in.
    results = []
    start_time = time.perf_counter()
    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=config
        ):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
    duration = time.perf_counter() - start_time

    # Summarize the results
    print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
    # Guard against ZeroDivisionError when the crawl yields no pages
    # (e.g. all URLs filtered out or the site is unreachable).
    if results:
        print(
            f"✅ Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}"
        )

    # Group by depth
    depth_counts = {}
    for result in results:
        depth = result.metadata.get("depth", 0)
        depth_counts[depth] = depth_counts.get(depth, 0) + 1
    print("\n📊 Pages crawled by depth:")
    for depth, count in sorted(depth_counts.items()):
        print(f"  Depth {depth}: {count} pages")
async def run_tutorial(): async def run_tutorial():
""" """
Executes all tutorial sections in sequence. Executes all tutorial sections in sequence.
@@ -482,12 +478,12 @@ async def run_tutorial():
# Define sections - uncomment to run specific parts during development # Define sections - uncomment to run specific parts during development
tutorial_sections = [ tutorial_sections = [
# basic_deep_crawl, basic_deep_crawl,
# stream_vs_nonstream, stream_vs_nonstream,
# filters_and_scorers, filters_and_scorers,
max_pages_and_thresholds, # Added new section max_pages_and_thresholds,
wrap_up,
advanced_filters, advanced_filters,
wrap_up,
] ]
for section in tutorial_sections: for section in tutorial_sections:
@@ -497,7 +493,6 @@ async def run_tutorial():
print("You now have a comprehensive understanding of deep crawling with Crawl4AI.") print("You now have a comprehensive understanding of deep crawling with Crawl4AI.")
print("For more information, check out https://docs.crawl4ai.com") print("For more information, check out https://docs.crawl4ai.com")
# Execute the tutorial when run directly # Execute the tutorial when run directly
if __name__ == "__main__": if __name__ == "__main__":
asyncio.run(run_tutorial()) asyncio.run(run_tutorial())

View File

@@ -78,7 +78,7 @@ crawl4ai-download-models = "crawl4ai.model_loader:main"
crawl4ai-migrate = "crawl4ai.migrations:main" crawl4ai-migrate = "crawl4ai.migrations:main"
crawl4ai-setup = "crawl4ai.install:post_install" crawl4ai-setup = "crawl4ai.install:post_install"
crawl4ai-doctor = "crawl4ai.install:doctor" crawl4ai-doctor = "crawl4ai.install:doctor"
crwl = "crawl4ai.cli:cli" crwl = "crawl4ai.cli:main"
[tool.setuptools] [tool.setuptools]
packages = {find = {where = ["."], include = ["crawl4ai*"]}} packages = {find = {where = ["."], include = ["crawl4ai*"]}}