2025 feb alpha 1 (#685)

* spelling change in prompt * gpt-4o-mini support * Remove leading Y before here * prompt spell correction * (Docs) Fix numbered list end-of-line formatting Added the missing "two spaces" to add a line break * fix: access downloads_path through browser_config in _handle_download method - Fixes #585 * crawl * fix: https://github.com/unclecode/crawl4ai/issues/592 * fix: https://github.com/unclecode/crawl4ai/issues/583 * Docs update: https://github.com/unclecode/crawl4ai/issues/649 * fix: https://github.com/unclecode/crawl4ai/issues/570 * Docs: updated example for content-selection to reflect new changes in yc newsfeed css * Refactor: Removed old filters and replaced with optimised filters * fix:Fixed imports as per the new names of filters * Tests: For deep crawl filters * Refactor: Remove old scorers and replace with optimised ones: Fix imports forall filters and scorers. * fix: awaiting on filters that are async in nature eg: content relevance and seo filters * fix: https://github.com/unclecode/crawl4ai/issues/592 * fix: https://github.com/unclecode/crawl4ai/issues/715 --------- Co-authored-by: DarshanTank <darshan.tank@gnani.ai> Co-authored-by: Tuhin Mallick <tuhin.mllk@gmail.com> Co-authored-by: Serhat Soydan <ssoydan@gmail.com> Co-authored-by: cardit1 <maneesh@cardit.in> Co-authored-by: Tautik Agrahari <tautikagrahari@gmail.com>
2025-02-19 11:43:17 +05:30
parent c171891999
commit dad592c801
19 changed files with 833 additions and 1350 deletions
--- a/tests/20241401/test_advanced_deep_crawl.py
+++ b/tests/20241401/test_advanced_deep_crawl.py
@@ -0,0 +1,46 @@
+import asyncio
+import time
+
+
+from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
+from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
+from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
+from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter, DomainFilter, ContentTypeFilter, ContentRelevanceFilter
+from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
+# from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
+
+
+async def main():
+    """Deep-crawl https://techcrunch.com best-first and print each result's URL and depth, then the elapsed time."""
+    filter_chain = FilterChain([
+        URLPatternFilter(patterns=["*2025*"]),
+        DomainFilter(allowed_domains=["techcrunch.com"]),
+        ContentRelevanceFilter(query="Use of artificial intelligence in Defence applications", threshold=1),
+        ContentTypeFilter(allowed_types=["text/html","application/javascript"])
+    ])
+    config = CrawlerRunConfig(
+        deep_crawl_strategy = BestFirstCrawlingStrategy(
+            max_depth=2,
+            include_external=False,
+            filter_chain=filter_chain,
+            url_scorer=KeywordRelevanceScorer(keywords=["anduril", "defence", "AI"]),
+        ),
+        stream=False,  # overwritten with True below, before the crawl starts
+        verbose=True,
+        cache_mode=CacheMode.BYPASS,
+        scraping_strategy=LXMLWebScrapingStrategy()
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        print("Starting deep crawl in streaming mode:")
+        config.stream = True  # presumably required for the "async for" over arun()'s result — confirm
+        start_time = time.perf_counter()
+        async for result in await crawler.arun(
+            url="https://techcrunch.com",
+            config=config
+        ):
+            print(f"→ {result.url} (Depth: {result.metadata.get('depth', 0)})")  # depth falls back to 0 when the key is absent
+        print(f"Duration: {time.perf_counter() - start_time:.2f} seconds")
+
+if __name__ == "__main__":  # run the example crawl only when executed as a script
+    asyncio.run(main())
--- a/tests/20241401/test_deep_crawl_filters.py
+++ b/tests/20241401/test_deep_crawl_filters.py
@@ -0,0 +1,279 @@
+from crawl4ai.deep_crawling.filters import ContentRelevanceFilter, URLPatternFilter, DomainFilter, ContentTypeFilter, SEOFilter
+async def test_pattern_filter():
+    """Print pass/fail for URLPatternFilter.apply() over (pattern or pattern list, {url: expected}) tuples."""
+    test_cases = [
+        # Simple suffix patterns (*.html)
+        ("*.html", {
+            "https://example.com/page.html": True,
+            "https://example.com/path/doc.html": True,
+            "https://example.com/page.htm": False,
+            "https://example.com/page.html?param=1": True,
+        }),
+        
+        # Path segment patterns (*/article/*): "article" must be a whole segment followed by "/"
+        ("*/article/*", {
+            "https://example.com/article/123": True,
+            "https://example.com/blog/article/456": True,
+            "https://example.com/articles/789": False,
+            "https://example.com/article": False,
+        }),
+        
+        # Wildcard combined with a character class
+        ("blog-*-[0-9]", {
+            "https://example.com/blog-post-1": True,
+            "https://example.com/blog-test-9": True,
+            "https://example.com/blog-post": False,
+            "https://example.com/blog-post-x": False,
+        }),
+        
+        # Multiple patterns case: a URL is expected to pass when either pattern matches
+        (["*.pdf", "*/download/*"], {
+            "https://example.com/doc.pdf": True,
+            "https://example.com/download/file.txt": True,
+            "https://example.com/path/download/doc": True,
+            "https://example.com/uploads/file.txt": False,
+        }),
+        
+        # Edge cases: a bare "*" is expected to match everything, including the empty string
+        ("*", {
+            "https://example.com": True,
+            "": True,
+            "http://test.com/path": True,
+        }),
+        
+        # Regex pattern (anchored with ^) rather than a glob
+        (r"^https?://.*\.example\.com/\d+", {
+            "https://sub.example.com/123": True,
+            "http://test.example.com/456": True,
+            "https://example.com/789": False,
+            "https://sub.example.com/abc": False,
+        })
+    ]
+
+    def run_accuracy_test():
+        print("\nAccuracy Tests:")
+        print("-" * 50)
+        # One filter per test case; every URL of the case is checked against it.
+        all_passed = True
+        for patterns, test_urls in test_cases:
+            filter_obj = URLPatternFilter(patterns)
+            
+            for url, expected in test_urls.items():
+                result = filter_obj.apply(url)
+                if result != expected:
+                    print(f"❌ Failed: Pattern '{patterns}' with URL '{url}'")
+                    print(f"   Expected: {expected}, Got: {result}")
+                    all_passed = False
+                else:
+                    print(f"✅ Passed: Pattern '{patterns}' with URL '{url}'")
+        
+        return all_passed
+
+    # Run tests
+    print("Running Pattern Filter Tests...")
+    accuracy_passed = run_accuracy_test()
+    # NOTE(review): the outcome is only printed; nothing is asserted or returned, so mismatches never raise.
+    if accuracy_passed:
+        print("\n✨ All accuracy tests passed!")
+        
+    else:
+        print("\n❌ Some accuracy tests failed!")
+
+async def test_domain_filter():
+    """Print pass/fail for DomainFilter.apply() over allowed/blocked domain settings; nothing is asserted."""
+
+    # Test cases: (params for DomainFilter, {url: expected result of apply()})
+    test_cases = [
+        # Allowed domains: a subdomain of an allowed domain is expected to be rejected
+        ({"allowed": "example.com"}, {
+            "https://example.com/page": True,
+            "http://example.com": True,
+            "https://sub.example.com": False,
+            "https://other.com": False,
+        }),
+
+        ({"allowed": ["example.com", "test.com"]}, {
+            "https://example.com/page": True,
+            "https://test.com/home": True,
+            "https://other.com": False,
+        }),
+
+        # Blocked domains: everything not blocked is expected to pass
+        ({"blocked": "malicious.com"}, {
+            "https://malicious.com": False,
+            "https://safe.com": True,
+            "http://malicious.com/login": False,
+        }),
+
+        ({"blocked": ["spam.com", "ads.com"]}, {
+            "https://spam.com": False,
+            "https://ads.com/banner": False,
+            "https://example.com": True,
+        }),
+
+        # Allowed and Blocked combination
+        ({"allowed": "example.com", "blocked": "sub.example.com"}, {
+            "https://example.com": True,
+            "https://sub.example.com": False,
+            "https://other.com": False,
+        }),
+    ]
+
+    def run_accuracy_test():
+        print("\nAccuracy Tests:")
+        print("-" * 50)
+        # A missing "allowed"/"blocked" key is passed to DomainFilter as None via dict.get().
+        all_passed = True
+        for params, test_urls in test_cases:
+            filter_obj = DomainFilter(
+                allowed_domains=params.get("allowed"),
+                blocked_domains=params.get("blocked"),
+            )
+            
+            for url, expected in test_urls.items():
+                result = filter_obj.apply(url)
+                if result != expected:
+                    print(f"\u274C Failed: Params {params} with URL '{url}'")
+                    print(f"   Expected: {expected}, Got: {result}")
+                    all_passed = False
+                else:
+                    print(f"\u2705 Passed: Params {params} with URL '{url}'")
+        
+        return all_passed
+
+    # Run tests
+    print("Running Domain Filter Tests...")
+    accuracy_passed = run_accuracy_test()
+    
+    if accuracy_passed:
+        print("\n\u2728 All accuracy tests passed!")
+    else:
+        print("\n\u274C Some accuracy tests failed!")
+
+async def test_content_relevance_filter():  # prints pass/fail per URL; nothing is asserted
+    relevance_filter = ContentRelevanceFilter(
+        query="What was the cause of american civil war?", 
+        threshold=1
+    )
+    # Maps each URL to the verdict expected from the filter for the query above.
+    test_cases = {
+        "https://en.wikipedia.org/wiki/Cricket": False,
+        "https://en.wikipedia.org/wiki/American_Civil_War": True,
+    }
+
+    print("\nRunning Content Relevance Filter Tests...")
+    print("-" * 50)
+    # NOTE(review): apply() is awaited here; presumably it fetches the live page, so network access is needed — confirm.
+    all_passed = True
+    for url, expected in test_cases.items():
+        result = await relevance_filter.apply(url)
+        if result != expected:
+            print(f"\u274C Failed: URL '{url}'")
+            print(f"   Expected: {expected}, Got: {result}")
+            all_passed = False
+        else:
+            print(f"\u2705 Passed: URL '{url}'")
+    
+    if all_passed:
+        print("\n\u2728 All content relevance tests passed!")
+    else:
+        print("\n\u274C Some content relevance tests failed!")
+
+async def test_content_type_filter():
+    """Print pass/fail for ContentTypeFilter.apply() over URLs with various file extensions; nothing is asserted."""
+
+    # Test cases: ({"allowed": type or list of types}, {url: expected result of apply()})
+    test_cases = [
+        # Allowed single type
+        ({"allowed": "image/png"}, {
+            "https://example.com/image.png": True,
+            "https://example.com/photo.jpg": False,
+            "https://example.com/document.pdf": False,
+        }),
+
+        # Multiple allowed types
+        ({"allowed": ["image/jpeg", "application/pdf"]}, {
+            "https://example.com/photo.jpg": True,
+            "https://example.com/document.pdf": True,
+            "https://example.com/script.js": False,
+        }),
+
+        # URLs without a file extension are expected to pass
+        ({"allowed": "application/json"}, {
+            "https://example.com/api/data": True,
+            "https://example.com/data.json": True,
+            "https://example.com/page.html": False,
+        }),
+
+        # An unrecognised extension is expected to pass; known extensions of other types are rejected
+        ({"allowed": "application/octet-stream"}, {
+            "https://example.com/file.unknown": True,
+            "https://example.com/archive.zip": False,
+            "https://example.com/software.exe": False,
+        }),
+    ]
+
+    def run_accuracy_test():
+        print("\nAccuracy Tests:")
+        print("-" * 50)
+        # One filter per test case; every URL of the case is checked against it.
+        all_passed = True
+        for params, test_urls in test_cases:
+            filter_obj = ContentTypeFilter(
+                allowed_types=params.get("allowed"),
+            )
+            
+            for url, expected in test_urls.items():
+                result = filter_obj.apply(url)
+                if result != expected:
+                    print(f"\u274C Failed: Params {params} with URL '{url}'")
+                    print(f"   Expected: {expected}, Got: {result}")
+                    all_passed = False
+                else:
+                    print(f"\u2705 Passed: Params {params} with URL '{url}'")
+        
+        return all_passed
+
+    # Run tests
+    print("Running Content Type Filter Tests...")
+    accuracy_passed = run_accuracy_test()
+    
+    if accuracy_passed:
+        print("\n\u2728 All accuracy tests passed!")
+    else:
+        print("\n\u274C Some accuracy tests failed!")
+
+async def test_seo_filter():  # prints pass/fail per URL; nothing is asserted
+    seo_filter = SEOFilter(threshold=0.5, keywords=["SEO", "search engines", "Optimization"])
+    # Maps each URL to the verdict expected from the filter for the keywords above.
+    test_cases = {
+        "https://en.wikipedia.org/wiki/Search_engine_optimization": True,
+        "https://en.wikipedia.org/wiki/Randomness": False,
+    }
+
+    print("\nRunning SEO Filter Tests...")
+    print("-" * 50)
+    # NOTE(review): apply() is awaited here; presumably it fetches the live page, so network access is needed — confirm.
+    all_passed = True
+    for url, expected in test_cases.items():
+        result = await seo_filter.apply(url)
+        if result != expected:
+            print(f"\u274C Failed: URL '{url}'")
+            print(f"   Expected: {expected}, Got: {result}")
+            all_passed = False
+        else:
+            print(f"\u2705 Passed: URL '{url}'")
+    
+    if all_passed:
+        print("\n\u2728 All SEO filter tests passed!")
+    else:
+        print("\n\u274C Some SEO filter tests failed!")
+
+import asyncio
+
+if __name__ == "__main__":  # manual runner; each asyncio.run() call starts its own event loop
+    asyncio.run(test_pattern_filter())
+    asyncio.run(test_domain_filter())
+    asyncio.run(test_content_type_filter())
+    asyncio.run(test_content_relevance_filter())
+    asyncio.run(test_seo_filter())
--- a/tests/20241401/test_deep_crawl_scorers.py
+++ b/tests/20241401/test_deep_crawl_scorers.py
@@ -0,0 +1,179 @@
+from crawl4ai.deep_crawling.scorers import CompositeScorer, ContentTypeScorer, DomainAuthorityScorer, FreshnessScorer, KeywordRelevanceScorer, PathDepthScorer
+
+
+def test_scorers():  # prints pass/fail per URL for each scorer and for a composite; nothing is asserted
+    test_cases = [
+        # Keyword Scorer Tests
+        {
+            "scorer_type": "keyword",
+            "config": {
+                "keywords": ["python", "blog"],
+                "weight": 1.0,
+                "case_sensitive": False
+            },
+            "urls": {
+                "https://example.com/python-blog": 1.0,
+                "https://example.com/PYTHON-BLOG": 1.0,
+                "https://example.com/python-only": 0.5,
+                "https://example.com/other": 0.0
+            }
+        },
+        
+        # Path Depth Scorer Tests
+        {
+            "scorer_type": "path_depth",
+            "config": {
+                "optimal_depth": 2,
+                "weight": 1.0
+            },
+            "urls": {
+                "https://example.com/a/b": 1.0,
+                "https://example.com/a": 0.5,
+                "https://example.com/a/b/c": 0.5,
+                "https://example.com": 0.33333333
+            }
+        },
+        
+        # Content Type Scorer Tests
+        {
+            "scorer_type": "content_type",
+            "config": {
+                "type_weights": {
+                    ".html$": 1.0,
+                    ".pdf$": 0.8,
+                    ".jpg$": 0.6
+                },
+                "weight": 1.0
+            },
+            "urls": {
+                "https://example.com/doc.html": 1.0,
+                "https://example.com/doc.pdf": 0.8,
+                "https://example.com/img.jpg": 0.6,
+                "https://example.com/other.txt": 0.0
+            }
+        },
+        
+        # Freshness Scorer Tests
+        {
+            "scorer_type": "freshness",
+            "config": {
+                "weight": 1.0,  # current_year is not part of this config; create_scorer() passes it separately
+            },
+            "urls": {
+                "https://example.com/2024/01/post": 1.0,
+                "https://example.com/2023/12/post": 0.9,
+                "https://example.com/2022/post": 0.8,
+                "https://example.com/no-date": 0.5
+            }
+        },
+        
+        # Domain Authority Scorer Tests
+        {
+            "scorer_type": "domain",
+            "config": {
+                "domain_weights": {
+                    "python.org": 1.0,
+                    "github.com": 0.8,
+                    "medium.com": 0.6
+                },
+                "default_weight": 0.3,
+                "weight": 1.0
+            },
+            "urls": {
+                "https://python.org/about": 1.0,
+                "https://github.com/repo": 0.8,
+                "https://medium.com/post": 0.6,
+                "https://unknown.com": 0.3
+            }
+        }
+    ]
+
+    def create_scorer(scorer_type, config):  # no fallback branch: an unlisted scorer_type returns None
+        if scorer_type == "keyword":
+            return KeywordRelevanceScorer(**config)
+        elif scorer_type == "path_depth":
+            return PathDepthScorer(**config)
+        elif scorer_type == "content_type":
+            return ContentTypeScorer(**config)
+        elif scorer_type == "freshness":
+            return FreshnessScorer(**config,current_year=2024)  # presumably pins the reference year for the expected scores — confirm
+        elif scorer_type == "domain":
+            return DomainAuthorityScorer(**config)
+
+    def run_accuracy_test():
+        print("\nAccuracy Tests:")
+        print("-" * 50)
+        # Each scorer is built from its own config and checked against its own URL table.
+        all_passed = True
+        for test_case in test_cases:
+            print(f"\nTesting {test_case['scorer_type']} scorer:")
+            scorer = create_scorer(
+                test_case['scorer_type'],
+                test_case['config']
+            )
+            
+            for url, expected in test_case['urls'].items():
+                score = round(scorer.score(url), 8)
+                expected = round(expected, 8)
+                # Both values are rounded to 8 places, then compared with a 1e-5 tolerance.
+                if abs(score - expected) > 0.00001:
+                    print(f"❌ Scorer Failed: URL '{url}'")
+                    print(f"   Expected: {expected}, Got: {score}")
+                    all_passed = False
+                else:
+                    print(f"✅ Scorer Passed: URL '{url}'")
+                    
+                    
+        return all_passed
+
+    def run_composite_test():
+        print("\nTesting Composite Scorer:")
+        print("-" * 50)
+        
+        # Expected composite score for each URL
+        test_urls = {
+            "https://python.org/blog/2024/01/new-release.html":0.86666667,
+            "https://github.com/repo/old-code.pdf": 0.62,
+            "https://unknown.com/random": 0.26
+        }
+        
+        # Build one scorer per entry in test_cases and combine them all
+        scorers = []
+        
+        for test_case in test_cases:
+            scorer = create_scorer(
+                test_case['scorer_type'],
+                test_case['config']
+            )
+            scorers.append(scorer)
+            
+        composite = CompositeScorer(scorers, normalize=True)
+        
+        all_passed = True
+        for url, expected in test_urls.items():
+            score = round(composite.score(url), 8)
+            
+            if abs(score - expected) > 0.00001:
+                print(f"❌ Composite Failed: URL '{url}'")
+                print(f"   Expected: {expected}, Got: {score}")
+                all_passed = False
+            else:
+                print(f"✅ Composite Passed: URL '{url}'")
+                
+        return all_passed
+
+    # Run tests
+    print("Running Scorer Tests...")
+    accuracy_passed = run_accuracy_test()
+    composite_passed = run_composite_test()
+    
+    if accuracy_passed and composite_passed:
+        print("\n✨ All tests passed!")
+        # NOTE(review): refers to run_scorer_performance_test(), which is not defined in the visible code — confirm it exists
+    else:
+        print("\n❌ Some tests failed!")
+
+    
+
+if __name__ == "__main__":  # run the scorer checks only when executed as a script
+    test_scorers()