Compare commits

2 Commits

Author      SHA1         Message                          Date
UncleCode   2e0dd7ffb9   "Claude Code Review workflow"    2025-10-04 12:45:20 +08:00
UncleCode   2bbcb1dc7d   "Claude PR Assistant workflow"   2025-10-04 12:45:18 +08:00

12 changed files with 152 additions and 281 deletions


@@ -0,0 +1,57 @@
name: Claude Code Review
on:
  pull_request:
    types: [opened, synchronize]
    # Optional: Only run on specific file changes
    # paths:
    #   - "src/**/*.ts"
    #   - "src/**/*.tsx"
    #   - "src/**/*.js"
    #   - "src/**/*.jsx"
jobs:
  claude-review:
    # Optional: Filter by PR author
    # if: |
    #   github.event.pull_request.user.login == 'external-contributor' ||
    #   github.event.pull_request.user.login == 'new-developer' ||
    #   github.event.pull_request.author_association == 'FIRST_TIME_CONTRIBUTOR'
    runs-on: ubuntu-latest
    permissions:
      contents: read
      pull-requests: read
      issues: read
      id-token: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 1
      - name: Run Claude Code Review
        id: claude-review
        uses: anthropics/claude-code-action@v1
        with:
          claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
          prompt: |
            REPO: ${{ github.repository }}
            PR NUMBER: ${{ github.event.pull_request.number }}
            Please review this pull request and provide feedback on:
            - Code quality and best practices
            - Potential bugs or issues
            - Performance considerations
            - Security concerns
            - Test coverage
            Use the repository's CLAUDE.md for guidance on style and conventions. Be constructive and helpful in your feedback.
            Use `gh pr comment` with your Bash tool to leave your review as a comment on the PR.
          # See https://github.com/anthropics/claude-code-action/blob/main/docs/usage.md
          # or https://docs.claude.com/en/docs/claude-code/sdk#command-line for available options
          claude_args: '--allowed-tools "Bash(gh issue view:*),Bash(gh search:*),Bash(gh issue list:*),Bash(gh pr comment:*),Bash(gh pr diff:*),Bash(gh pr view:*),Bash(gh pr list:*)"'

.github/workflows/claude.yml

@@ -0,0 +1,50 @@
name: Claude Code
on:
  issue_comment:
    types: [created]
  pull_request_review_comment:
    types: [created]
  issues:
    types: [opened, assigned]
  pull_request_review:
    types: [submitted]
jobs:
  claude:
    if: |
      (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) ||
      (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) ||
      (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) ||
      (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude')))
    runs-on: ubuntu-latest
    permissions:
      contents: read
      pull-requests: read
      issues: read
      id-token: write
      actions: read # Required for Claude to read CI results on PRs
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 1
      - name: Run Claude Code
        id: claude
        uses: anthropics/claude-code-action@v1
        with:
          claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
          # This is an optional setting that allows Claude to read CI results on PRs
          additional_permissions: |
            actions: read
          # Optional: Give a custom prompt to Claude. If this is not specified, Claude will perform the instructions specified in the comment that tagged it.
          # prompt: 'Update the pull request description to include a summary of changes.'
          # Optional: Add claude_args to customize behavior and configuration
          # See https://github.com/anthropics/claude-code-action/blob/main/docs/usage.md
          # or https://docs.claude.com/en/docs/claude-code/sdk#command-line for available options
          # claude_args: '--model claude-opus-4-1-20250805 --allowed-tools Bash(gh pr:*)'


@@ -373,7 +373,7 @@ async def main():
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://docs.micronaut.io/4.9.9/guide/",
            url="https://docs.micronaut.io/4.7.6/guide/",
            config=run_config
        )
        print(len(result.markdown.raw_markdown))
@@ -425,7 +425,7 @@ async def main():
"type": "attribute",
"attribute": "src"
}
]
}
}
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
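The hunk above shows only the tail of the extraction schema, so the nesting is hard to follow. For orientation, here is a minimal, hypothetical sketch of the schema shape that `JsonCssExtractionStrategy` consumes; the selectors and field names are illustrative and are not taken from this diff.

```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

# Hypothetical schema: one dict per matched baseSelector element, with an
# "attribute"-typed field like the "src" extraction shown in the hunk above.
schema = {
    "name": "Example images",
    "baseSelector": "div.article",
    "fields": [
        {"name": "title", "selector": "h2", "type": "text"},
        {
            "name": "image_src",
            "selector": "img",
            "type": "attribute",
            "attribute": "src",
        },
    ],
}

extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
```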


@@ -97,16 +97,13 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
            if value != param.default and not ignore_default_value:
                current_values[name] = to_serializable_dict(value)
        # Don't serialize private __slots__ - they're internal implementation details
        # not constructor parameters. This was causing URLPatternFilter to fail
        # because _simple_suffixes was being serialized as 'simple_suffixes'
        # if hasattr(obj, '__slots__'):
        #     for slot in obj.__slots__:
        #         if slot.startswith('_'):  # Handle private slots
        #             attr_name = slot[1:]  # Remove leading '_'
        #             value = getattr(obj, slot, None)
        #             if value is not None:
        #                 current_values[attr_name] = to_serializable_dict(value)
        if hasattr(obj, '__slots__'):
            for slot in obj.__slots__:
                if slot.startswith('_'):  # Handle private slots
                    attr_name = slot[1:]  # Remove leading '_'
                    value = getattr(obj, slot, None)
                    if value is not None:
                        current_values[attr_name] = to_serializable_dict(value)
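The comment in this hunk explains the failure mode: a private slot such as `_simple_suffixes` is derived state, so dumping it under the stripped name `simple_suffixes` produces a key the constructor never accepts. A minimal illustration with a hypothetical stand-in class (not the real `URLPatternFilter`):

```python
class TinyFilter:
    """Hypothetical stand-in: derived state lives in a private slot."""
    __slots__ = ("_simple_suffixes",)

    def __init__(self, patterns):
        # Derived from `patterns`; it is not itself a constructor parameter.
        self._simple_suffixes = tuple(p.lower() for p in patterns)

f = TinyFilter(["*.HTML"])

# Dumping the private slot under a public-looking name...
params = {"simple_suffixes": list(f._simple_suffixes)}

# ...cannot be fed back into the constructor.
try:
    TinyFilter(**params)
except TypeError as e:
    print(e)  # unexpected keyword argument 'simple_suffixes'
```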


@@ -824,7 +824,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                except Error:
                    visibility_info = await self.check_visibility(page)
                    if self.browser_config.verbose:
                    if self.browser_config.config.verbose:
                        self.logger.debug(
                            message="Body visibility info: {info}",
                            tag="DEBUG",


@@ -47,13 +47,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
        self.url_scorer = url_scorer
        self.include_external = include_external
        self.max_pages = max_pages
        # self.logger = logger or logging.getLogger(__name__)
        # Ensure logger is always a Logger instance, not a dict from serialization
        if isinstance(logger, logging.Logger):
            self.logger = logger
        else:
            # Create a new logger if logger is None, dict, or any other non-Logger type
            self.logger = logging.getLogger(__name__)
        self.logger = logger or logging.getLogger(__name__)
        self.stats = TraversalStats(start_time=datetime.now())
        self._cancel_event = asyncio.Event()
        self._pages_crawled = 0
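The two variants in this hunk differ in how defensively `logger` is handled. Once a strategy has been round-tripped through JSON (the Docker/REST path exercised by the test file at the end of this compare), `logger` can arrive as a plain dict; because a non-empty dict is truthy, `logger or logging.getLogger(__name__)` keeps it, and a later `self.logger.debug(...)` fails. A small sketch of the stricter pattern, with a hypothetical helper name:

```python
import logging

def coerce_logger(logger) -> logging.Logger:
    """Hypothetical helper: accept only real Logger instances."""
    if isinstance(logger, logging.Logger):
        return logger
    # None, a dict from deserialization, or anything else falls back here.
    return logging.getLogger(__name__)

deserialized = {"name": "crawler", "level": "DEBUG"}  # what JSON round-tripping can hand back

print(type(deserialized or logging.getLogger(__name__)))  # <class 'dict'> -- truthy, so the dict survives
print(type(coerce_logger(deserialized)))                  # <class 'logging.Logger'>
```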


@@ -38,13 +38,7 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
        self.include_external = include_external
        self.score_threshold = score_threshold
        self.max_pages = max_pages
        # self.logger = logger or logging.getLogger(__name__)
        # Ensure logger is always a Logger instance, not a dict from serialization
        if isinstance(logger, logging.Logger):
            self.logger = logger
        else:
            # Create a new logger if logger is None, dict, or any other non-Logger type
            self.logger = logging.getLogger(__name__)
        self.logger = logger or logging.getLogger(__name__)
        self.stats = TraversalStats(start_time=datetime.now())
        self._cancel_event = asyncio.Event()
        self._pages_crawled = 0


@@ -120,9 +120,6 @@ class URLPatternFilter(URLFilter):
"""Pattern filter balancing speed and completeness"""
__slots__ = (
"patterns", # Store original patterns for serialization
"use_glob", # Store original use_glob for serialization
"reverse", # Store original reverse for serialization
"_simple_suffixes",
"_simple_prefixes",
"_domain_patterns",
@@ -145,11 +142,6 @@ class URLPatternFilter(URLFilter):
reverse: bool = False,
):
super().__init__()
# Store original constructor params for serialization
self.patterns = patterns
self.use_glob = use_glob
self.reverse = reverse
self._reverse = reverse
patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
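The `self.patterns` / `self.use_glob` / `self.reverse` assignments in this hunk give a signature-driven serializer (like `to_serializable_dict` earlier in this compare) a public attribute for every constructor parameter, while derived state stays private. A reduced sketch of that round trip, using a hypothetical `GlobFilter` and `dump` helper rather than the real crawl4ai code:

```python
import inspect

class GlobFilter:
    """Hypothetical filter: public copies of ctor args, private derived state."""
    def __init__(self, patterns, reverse=False):
        self.patterns = patterns        # kept public so the serializer can find it
        self.reverse = reverse
        self._compiled = tuple(p.lower() for p in patterns)  # derived, never dumped

def dump(obj):
    """Tiny stand-in for a signature-driven serializer."""
    sig = inspect.signature(obj.__class__.__init__)
    params = {name: getattr(obj, name) for name in sig.parameters if name != "self"}
    return {"type": obj.__class__.__name__, "params": params}

original = GlobFilter(["*advanced*"], reverse=True)
payload = dump(original)                    # {'type': 'GlobFilter', 'params': {'patterns': [...], 'reverse': True}}
rebuilt = GlobFilter(**payload["params"])   # round-trips cleanly
print(payload, rebuilt.reverse)
```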


@@ -253,16 +253,6 @@ class CrawlResult(BaseModel):
        requirements change, this is where you would update the logic.
        """
        result = super().model_dump(*args, **kwargs)
        # Remove any property descriptors that might have been included
        # These deprecated properties should not be in the serialized output
        for key in ['fit_html', 'fit_markdown', 'markdown_v2']:
            if key in result and isinstance(result[key], property):
                # del result[key]
                # Nasrin: I decided to convert it to string instead of removing it.
                result[key] = str(result[key])
        # Add the markdown field properly
        if self._markdown is not None:
            result["markdown"] = self._markdown.model_dump()
        return result
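This hunk customizes pydantic's `model_dump()` to post-process the serialized dict: deprecated property values are stringified and the private `_markdown` payload is merged back in. A reduced sketch of the same pattern on a hypothetical model, assuming pydantic v2:

```python
from typing import Optional
from pydantic import BaseModel, PrivateAttr

class Result(BaseModel):
    """Hypothetical model; mirrors only the post-processing idea, not CrawlResult."""
    url: str
    _markdown: Optional[dict] = PrivateAttr(default=None)

    def model_dump(self, *args, **kwargs):
        data = super().model_dump(*args, **kwargs)
        # Private attributes are skipped by pydantic's dump, so merge them in
        # explicitly when present.
        if self._markdown is not None:
            data["markdown"] = self._markdown
        return data

r = Result(url="https://docs.crawl4ai.com")
r._markdown = {"raw_markdown": "# Title"}
print(r.model_dump())  # {'url': 'https://docs.crawl4ai.com', 'markdown': {'raw_markdown': '# Title'}}
```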


@@ -4,7 +4,7 @@ import asyncio
from typing import List, Tuple, Dict
from functools import partial
from uuid import uuid4
from datetime import datetime, timezone
from datetime import datetime
from base64 import b64encode
import logging
@@ -576,7 +576,7 @@ async def handle_crawl_job(
task_id = f"crawl_{uuid4().hex[:8]}"
await redis.hset(f"task:{task_id}", mapping={
"status": TaskStatus.PROCESSING, # <-- keep enum values consistent
"created_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
"created_at": datetime.utcnow().isoformat(),
"url": json.dumps(urls), # store list as JSON string
"result": "",
"error": "",


@@ -102,16 +102,16 @@ async def smart_blog_crawler():
# Step 2: Configure discovery - let's find all blog posts
config = SeedingConfig(
source="sitemap+cc", # Use the website's sitemap+cc
pattern="*/courses/*", # Only courses related posts
source="sitemap", # Use the website's sitemap
pattern="*/blog/*.html", # Only blog posts
extract_head=True, # Get page metadata
max_urls=100 # Limit for this example
)
# Step 3: Discover URLs from the Python blog
print("🔍 Discovering course posts...")
print("🔍 Discovering blog posts...")
urls = await seeder.urls("realpython.com", config)
print(f"✅ Found {len(urls)} course posts")
print(f"✅ Found {len(urls)} blog posts")
# Step 4: Filter for Python tutorials (using metadata!)
tutorials = [
@@ -134,8 +134,7 @@ async def smart_blog_crawler():
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
only_text=True,
word_count_threshold=300, # Only substantial articles
stream=True
word_count_threshold=300 # Only substantial articles
)
# Extract URLs and crawl them
@@ -156,7 +155,7 @@ asyncio.run(smart_blog_crawler())
**What just happened?**
1. We discovered all blog URLs from the sitemap+cc
1. We discovered all blog URLs from the sitemap
2. We filtered using metadata (no crawling needed!)
3. We crawled only the relevant tutorials
4. We saved tons of time and bandwidth
@@ -283,8 +282,8 @@ config = SeedingConfig(
live_check=True, # Verify each URL is accessible
concurrency=20 # Check 20 URLs in parallel
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("example.com", config)
urls = await seeder.urls("example.com", config)
# Now you can filter by status
live_urls = [u for u in urls if u["status"] == "valid"]
@@ -312,8 +311,8 @@ This is where URL seeding gets really powerful. Instead of crawling entire pages
config = SeedingConfig(
extract_head=True # Extract metadata from <head> section
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("example.com", config)
urls = await seeder.urls("example.com", config)
# Now each URL has rich metadata
for url in urls[:3]:
@@ -388,8 +387,8 @@ config = SeedingConfig(
scoring_method="bm25",
score_threshold=0.3
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("example.com", config)
urls = await seeder.urls("example.com", config)
# URLs are scored based on:
# 1. Domain parts matching (e.g., 'python' in python.example.com)
@@ -430,8 +429,8 @@ config = SeedingConfig(
extract_head=True,
live_check=True
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("blog.example.com", config)
urls = await seeder.urls("blog.example.com", config)
# Analyze the results
for url in urls[:5]:
@@ -489,8 +488,8 @@ config = SeedingConfig(
scoring_method="bm25", # Use BM25 algorithm
score_threshold=0.3 # Minimum relevance score
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("realpython.com", config)
urls = await seeder.urls("realpython.com", config)
# Results are automatically sorted by relevance!
for url in urls[:5]:
@@ -512,8 +511,8 @@ config = SeedingConfig(
score_threshold=0.5,
max_urls=20
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("docs.example.com", config)
urls = await seeder.urls("docs.example.com", config)
# The highest scoring URLs will be API docs!
```
@@ -530,8 +529,8 @@ config = SeedingConfig(
score_threshold=0.4,
pattern="*/product/*" # Combine with pattern matching
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("shop.example.com", config)
urls = await seeder.urls("shop.example.com", config)
# Filter further by price (from metadata)
affordable = [
@@ -551,8 +550,8 @@ config = SeedingConfig(
scoring_method="bm25",
score_threshold=0.35
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("technews.com", config)
urls = await seeder.urls("technews.com", config)
# Filter by date
from datetime import datetime, timedelta
@@ -592,8 +591,8 @@ for query in queries:
score_threshold=0.4,
max_urls=10 # Top 10 per topic
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("learning-platform.com", config)
urls = await seeder.urls("learning-platform.com", config)
all_tutorials.extend(urls)
# Remove duplicates while preserving order
@@ -626,8 +625,7 @@ config = SeedingConfig(
)
# Returns a dictionary: {domain: [urls]}
async with AsyncUrlSeeder() as seeder:
results = await seeder.many_urls(domains, config)
results = await seeder.many_urls(domains, config)
# Process results
for domain, urls in results.items():
@@ -656,8 +654,8 @@ config = SeedingConfig(
pattern="*/blog/*",
max_urls=100
)
async with AsyncUrlSeeder() as seeder:
results = await seeder.many_urls(competitors, config)
results = await seeder.many_urls(competitors, config)
# Analyze content types
for domain, urls in results.items():
@@ -692,8 +690,8 @@ config = SeedingConfig(
score_threshold=0.3,
max_urls=20 # Per site
)
async with AsyncUrlSeeder() as seeder:
results = await seeder.many_urls(educational_sites, config)
results = await seeder.many_urls(educational_sites, config)
# Find the best beginner tutorials
all_tutorials = []
@@ -733,8 +731,8 @@ config = SeedingConfig(
score_threshold=0.5, # High threshold for relevance
max_urls=10
)
async with AsyncUrlSeeder() as seeder:
results = await seeder.many_urls(news_sites, config)
results = await seeder.many_urls(news_sites, config)
# Collect all mentions
mentions = []


@@ -1,201 +0,0 @@
"""
Test the complete fix for both the filter serialization and JSON serialization issues.
"""
import asyncio
import httpx
from crawl4ai import BrowserConfig, CacheMode, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, FilterChain, URLPatternFilter
BASE_URL = "http://localhost:11234/" # Adjust port as needed
async def test_with_docker_client():
"""Test using the Docker client (same as 1419.py)."""
from crawl4ai.docker_client import Crawl4aiDockerClient
print("=" * 60)
print("Testing with Docker Client")
print("=" * 60)
try:
async with Crawl4aiDockerClient(
base_url=BASE_URL,
verbose=True,
) as client:
# Create filter chain - testing the serialization fix
filter_chain = [
URLPatternFilter(
# patterns=["*about*", "*privacy*", "*terms*"],
patterns=["*advanced*"],
reverse=True
),
]
crawler_config = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(
max_depth=2, # Keep it shallow for testing
# max_pages=5, # Limit pages for testing
filter_chain=FilterChain(filter_chain)
),
cache_mode=CacheMode.BYPASS,
)
print("\n1. Testing crawl with filters...")
results = await client.crawl(
["https://docs.crawl4ai.com"], # Simple test page
browser_config=BrowserConfig(headless=True),
crawler_config=crawler_config,
)
if results:
print(f"✅ Crawl succeeded! Type: {type(results)}")
if hasattr(results, 'success'):
print(f"✅ Results success: {results.success}")
# Test that we can iterate results without JSON errors
if hasattr(results, '__iter__'):
for i, result in enumerate(results):
if hasattr(result, 'url'):
print(f" Result {i}: {result.url[:50]}...")
else:
print(f" Result {i}: {str(result)[:50]}...")
else:
# Handle list of results
print(f"✅ Got {len(results)} results")
for i, result in enumerate(results[:3]): # Show first 3
print(f" Result {i}: {result.url[:50]}...")
else:
print("❌ Crawl failed - no results returned")
return False
print("\n✅ Docker client test completed successfully!")
return True
except Exception as e:
print(f"❌ Docker client test failed: {e}")
import traceback
traceback.print_exc()
return False
async def test_with_rest_api():
"""Test using REST API directly."""
print("\n" + "=" * 60)
print("Testing with REST API")
print("=" * 60)
# Create filter configuration
deep_crawl_strategy_payload = {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": 2,
# "max_pages": 5,
"filter_chain": {
"type": "FilterChain",
"params": {
"filters": [
{
"type": "URLPatternFilter",
"params": {
"patterns": ["*advanced*"],
"reverse": True
}
}
]
}
}
}
}
crawl_payload = {
"urls": ["https://docs.crawl4ai.com"],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"deep_crawl_strategy": deep_crawl_strategy_payload,
"cache_mode": "bypass"
}
}
}
try:
async with httpx.AsyncClient() as client:
print("\n1. Sending crawl request to REST API...")
response = await client.post(
f"{BASE_URL}crawl",
json=crawl_payload,
timeout=30
)
if response.status_code == 200:
print(f"✅ REST API returned 200 OK")
data = response.json()
if data.get("success"):
results = data.get("results", [])
print(f"✅ Got {len(results)} results")
for i, result in enumerate(results[:3]):
print(f" Result {i}: {result.get('url', 'unknown')[:50]}...")
else:
print(f"❌ Crawl not successful: {data}")
return False
else:
print(f"❌ REST API returned {response.status_code}")
print(f" Response: {response.text[:500]}")
return False
print("\n✅ REST API test completed successfully!")
return True
except Exception as e:
print(f"❌ REST API test failed: {e}")
import traceback
traceback.print_exc()
return False
async def main():
"""Run all tests."""
print("\n🧪 TESTING COMPLETE FIX FOR DOCKER FILTER AND JSON ISSUES")
print("=" * 60)
print("Make sure the server is running with the updated code!")
print("=" * 60)
results = []
# Test 1: Docker client
docker_passed = await test_with_docker_client()
results.append(("Docker Client", docker_passed))
# Test 2: REST API
rest_passed = await test_with_rest_api()
results.append(("REST API", rest_passed))
# Summary
print("\n" + "=" * 60)
print("FINAL TEST SUMMARY")
print("=" * 60)
all_passed = True
for test_name, passed in results:
status = "✅ PASSED" if passed else "❌ FAILED"
print(f"{test_name:20} {status}")
if not passed:
all_passed = False
print("=" * 60)
if all_passed:
print("🎉 ALL TESTS PASSED! Both issues are fully resolved!")
print("\nThe fixes:")
print("1. Filter serialization: Fixed by not serializing private __slots__")
print("2. JSON serialization: Fixed by removing property descriptors from model_dump()")
else:
print("⚠️ Some tests failed. Please check the server logs for details.")
return 0 if all_passed else 1
if __name__ == "__main__":
import sys
sys.exit(asyncio.run(main()))