    # ... inside basic_deep_crawl(), earlier setup elided ...
    print(
        f"\n✅ Performance: {len(results)} pages in {time.perf_counter() - start_time:.2f} seconds"
    )
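# A minimal sketch of the kind of setup basic_deep_crawl() builds above
# (assumed shape - the elided code may differ). BFSDeepCrawlStrategy visits
# every link breadth-first up to max_depth; the import path below follows
# current crawl4ai releases, so check yours:
# from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
async def minimal_bfs_sketch():
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=2, include_external=False),
        scraping_strategy=LXMLWebScrapingStrategy(),
    )
    async with AsyncWebCrawler() as crawler:
        # Non-streaming by default: arun() returns the full list of results
        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
        print(f"Crawled {len(results)} pages")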
# 2️⃣ Stream vs. Non-Stream Execution
async def stream_vs_nonstream():
    """
    ...
    """
    # ... body elided ...
    print(f"✅ All results: {time.perf_counter() - start_time:.2f} seconds")
    print("\n🔍 Key Takeaway: Streaming allows processing results immediately")
# 3️⃣ Introduce Filters & Scorers
async def filters_and_scorers():
    """
    ...
    """
    # ... body elided ...
    print(f"✅ Crawler prioritized {len(results)} pages by relevance score")
    print("🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first")
# 4️⃣ Wrap-Up and Key Takeaways
async def wrap_up():
    """
    PART 4: Wrap-Up and Key Takeaways

    Summarize the key concepts learned in this tutorial.
    """
    print("\n===== COMPLETE CRAWLER EXAMPLE =====")
    print("Combining filters, scorers, and streaming for an optimized crawl")

    # Create a sophisticated filter chain
    filter_chain = FilterChain(
        [
            DomainFilter(
                allowed_domains=["docs.crawl4ai.com"],
                blocked_domains=["old.docs.crawl4ai.com"],
            ),
            URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
            ContentTypeFilter(allowed_types=["text/html"]),
        ]
    )
    # Create a keyword scorer that ranks candidate URLs by topical relevance
    keyword_scorer = KeywordRelevanceScorer(
        keywords=["crawl", "example", "async", "configuration"], weight=0.7
    )
    # Set up the configuration
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=1,
            include_external=False,
            filter_chain=filter_chain,
            url_scorer=keyword_scorer,
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        stream=True,
        verbose=True,
    )
    # Execute the crawl
    results = []
    start_time = time.perf_counter()

    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=config
        ):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
    duration = time.perf_counter() - start_time

    # Summarize the results
    print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
    if results:  # guard against division by zero when nothing passes the filters
        print(
            f"✅ Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}"
        )
    # Group by depth
    depth_counts = {}
    for result in results:
        depth = result.metadata.get("depth", 0)
        depth_counts[depth] = depth_counts.get(depth, 0) + 1

    print("\n📊 Pages crawled by depth:")
    for depth, count in sorted(depth_counts.items()):
        print(f"  Depth {depth}: {count} pages")
# 5️⃣ Advanced Filters
async def advanced_filters():
    """
    PART 5: Demonstrates advanced filtering techniques for specialized crawling.

    This function covers:
    - SEO filters
    ...
    """
    # ... body elided ...
            relevance_score = result.metadata.get("relevance_score", 0)
            print(f"→ Score: {relevance_score:.2f} | {result.url}")
# 6️⃣ Max Pages and Score Thresholds
async def max_pages_and_thresholds():
    """
    PART 6: Demonstrates using max_pages and score_threshold parameters with different strategies.

    This function shows:
    - How to limit the number of pages crawled
    ...
    """
    # ... body elided ...
    print(f"✅ Average score: {avg_score:.2f}")
    print("🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first")
# Main function to run the entire tutorial
async def run_tutorial():
    """
    Executes all tutorial sections in sequence.
    ...
    """
    # Define sections - uncomment to run specific parts during development
    tutorial_sections = [
        # basic_deep_crawl,
        # stream_vs_nonstream,
        # filters_and_scorers,
        max_pages_and_thresholds,  # Added new section
        wrap_up,
        advanced_filters,
    ]

    for section in tutorial_sections:
        # ... loop body elided ...
    print("You now have a comprehensive understanding of deep crawling with Crawl4AI.")
    print("For more information, check out https://docs.crawl4ai.com")
# Execute the tutorial when run directly
if __name__ == "__main__":
    asyncio.run(run_tutorial())