Update config.yml

2025-03-05 14:15:57 +08:00 · 2025-03-05 12:51:07 +08:00
5 changed files with 92 additions and 93 deletions
--- a/crawl4ai/version.py
+++ b/crawl4ai/version.py
@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.5.0.post1"
+__version__ = "0.5.0"
--- a/crawl4ai/cli.py
+++ b/crawl4ai/cli.py
@@ -712,7 +712,7 @@ def profiles_cmd():
    # Run interactive profile manager
    anyio.run(manage_profiles)
-@cli.command(name="")
+@cli.command()
@click.argument("url", required=False)
@click.option("--example", is_flag=True, help="Show usage examples")
@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
@@ -772,11 +772,5 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
        profile=profile
    )
 def main():
    import sys
    if len(sys.argv) < 2 or sys.argv[1] not in cli.commands:
        sys.argv.insert(1, "crawl")
    cli()
 if __name__ == "__main__":
-    main()
+    cli()
--- a/deploy/docker/config.yml
+++ b/deploy/docker/config.yml
@@ -38,8 +38,8 @@ rate_limiting:
 # Security Configuration
 security:
-  enabled: true 
+  enabled: false 
-  jwt_enabled: true 
+  jwt_enabled: false 
  https_redirect: false
  trusted_hosts: ["*"]
  headers:
--- a/docs/examples/deepcrawl_example.py
+++ b/docs/examples/deepcrawl_example.py
@@ -65,6 +65,7 @@ async def basic_deep_crawl():
            f"\n✅ Performance: {len(results)} pages in {time.perf_counter() - start_time:.2f} seconds"
        )
 # 2️⃣ Stream vs. Non-Stream Execution
 async def stream_vs_nonstream():
    """
@@ -126,6 +127,7 @@ async def stream_vs_nonstream():
        print(f"  ✅ All results: {time.perf_counter() - start_time:.2f} seconds")
        print("\n🔍 Key Takeaway: Streaming allows processing results immediately")
 # 3️⃣ Introduce Filters & Scorers
 async def filters_and_scorers():
    """
@@ -234,10 +236,82 @@ async def filters_and_scorers():
        print(f"  ✅ Crawler prioritized {len(results)} pages by relevance score")
        print("  🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first")
-# 4️⃣ Advanced Filters
+
 # 4️⃣ Wrap-Up and Key Takeaways
 async def wrap_up():
    """
    PART 4: Wrap-Up and Key Takeaways
    Summarize the key concepts learned in this tutorial.
    """
    print("\n===== COMPLETE CRAWLER EXAMPLE =====")
    print("Combining filters, scorers, and streaming for an optimized crawl")
    # Create a sophisticated filter chain
    filter_chain = FilterChain(
        [
            DomainFilter(
                allowed_domains=["docs.crawl4ai.com"],
                blocked_domains=["old.docs.crawl4ai.com"],
            ),
            URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
            ContentTypeFilter(allowed_types=["text/html"]),
        ]
    )
    # Create a composite scorer that combines multiple scoring strategies
    keyword_scorer = KeywordRelevanceScorer(
        keywords=["crawl", "example", "async", "configuration"], weight=0.7
    )
    # Set up the configuration
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=1,
            include_external=False,
            filter_chain=filter_chain,
            url_scorer=keyword_scorer,
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        stream=True,
        verbose=True,
    )
    # Execute the crawl
    results = []
    start_time = time.perf_counter()
    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=config
        ):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
    duration = time.perf_counter() - start_time
    # Summarize the results
    print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
    print(
        f"✅ Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}"
    )
    # Group by depth
    depth_counts = {}
    for result in results:
        depth = result.metadata.get("depth", 0)
        depth_counts[depth] = depth_counts.get(depth, 0) + 1
    print("\n📊 Pages crawled by depth:")
    for depth, count in sorted(depth_counts.items()):
        print(f"  Depth {depth}: {count} pages")
 # 5️⃣ Advanced Filters
 async def advanced_filters():
    """
-    PART 4: Demonstrates advanced filtering techniques for specialized crawling.
+    PART 5: Demonstrates advanced filtering techniques for specialized crawling.
    This function covers:
    - SEO filters
@@ -297,10 +371,11 @@ async def advanced_filters():
            relevance_score = result.metadata.get("relevance_score", 0)
            print(f"  → Score: {relevance_score:.2f} | {result.url}")
-# 5️⃣ Max Pages and Score Thresholds
+
 # Main function to run the entire tutorial
 async def max_pages_and_thresholds():
    """
-    PART 5: Demonstrates using max_pages and score_threshold parameters with different strategies.
+    PART 6: Demonstrates using max_pages and score_threshold parameters with different strategies.
    This function shows:
    - How to limit the number of pages crawled
@@ -396,77 +471,6 @@ async def max_pages_and_thresholds():
            print(f"  ✅ Average score: {avg_score:.2f}")
            print("  🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first")
 # 6️⃣ Wrap-Up and Key Takeaways
 async def wrap_up():
    """
    PART 6: Wrap-Up and Key Takeaways
    Summarize the key concepts learned in this tutorial.
    """
    print("\n===== COMPLETE CRAWLER EXAMPLE =====")
    print("Combining filters, scorers, and streaming for an optimized crawl")
    # Create a sophisticated filter chain
    filter_chain = FilterChain(
        [
            DomainFilter(
                allowed_domains=["docs.crawl4ai.com"],
                blocked_domains=["old.docs.crawl4ai.com"],
            ),
            URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
            ContentTypeFilter(allowed_types=["text/html"]),
        ]
    )
    # Create a composite scorer that combines multiple scoring strategies
    keyword_scorer = KeywordRelevanceScorer(
        keywords=["crawl", "example", "async", "configuration"], weight=0.7
    )
    # Set up the configuration
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=1,
            include_external=False,
            filter_chain=filter_chain,
            url_scorer=keyword_scorer,
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        stream=True,
        verbose=True,
    )
    # Execute the crawl
    results = []
    start_time = time.perf_counter()
    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=config
        ):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
    duration = time.perf_counter() - start_time
    # Summarize the results
    print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
    print(
        f"✅ Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}"
    )
    # Group by depth
    depth_counts = {}
    for result in results:
        depth = result.metadata.get("depth", 0)
        depth_counts[depth] = depth_counts.get(depth, 0) + 1
    print("\n📊 Pages crawled by depth:")
    for depth, count in sorted(depth_counts.items()):
        print(f"  Depth {depth}: {count} pages")
 async def run_tutorial():
    """
    Executes all tutorial sections in sequence.
@@ -478,12 +482,12 @@ async def run_tutorial():
    # Define sections - uncomment to run specific parts during development
    tutorial_sections = [
-        basic_deep_crawl,
+        # basic_deep_crawl,
-        stream_vs_nonstream,
+        # stream_vs_nonstream,
-        filters_and_scorers,
+        # filters_and_scorers,
-        max_pages_and_thresholds, 
+        max_pages_and_thresholds,  # Added new section
        advanced_filters,
        wrap_up,
        advanced_filters,
    ]
    for section in tutorial_sections:
@@ -493,6 +497,7 @@ async def run_tutorial():
    print("You now have a comprehensive understanding of deep crawling with Crawl4AI.")
    print("For more information, check out https://docs.crawl4ai.com")
 # Execute the tutorial when run directly
 if __name__ == "__main__":
    asyncio.run(run_tutorial())
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -78,7 +78,7 @@ crawl4ai-download-models = "crawl4ai.model_loader:main"
 crawl4ai-migrate = "crawl4ai.migrations:main"
 crawl4ai-setup = "crawl4ai.install:post_install"
 crawl4ai-doctor = "crawl4ai.install:doctor"
-crwl = "crawl4ai.cli:main"
+crwl = "crawl4ai.cli:cli"
 [tool.setuptools]
 packages = {find = {where = ["."], include = ["crawl4ai*"]}}
Author	SHA1	Message	Date
UncleCode	e12d2e29e5	Update config.yml	2025-03-05 14:15:57 +08:00
UncleCode	fc425023f5	Update config.yml	2025-03-05 12:51:07 +08:00
`@@ -1,2 +1,2 @@`
	`# crawl4ai/_version.py`	`# crawl4ai/_version.py`
	`__version__ = "0.5.0.post1"`	`__version__ = "0.5.0"`