2025 feb alpha 1 (#685)
* spelling change in prompt * gpt-4o-mini support * Remove leading Y before here * prompt spell correction * (Docs) Fix numbered list end-of-line formatting Added the missing "two spaces" to add a line break * fix: access downloads_path through browser_config in _handle_download method - Fixes #585 * crawl * fix: https://github.com/unclecode/crawl4ai/issues/592 * fix: https://github.com/unclecode/crawl4ai/issues/583 * Docs update: https://github.com/unclecode/crawl4ai/issues/649 * fix: https://github.com/unclecode/crawl4ai/issues/570 * Docs: updated example for content-selection to reflect new changes in yc newsfeed css * Refactor: Removed old filters and replaced with optimised filters * fix: Fixed imports as per the new names of filters * Tests: For deep crawl filters * Refactor: Remove old scorers and replace with optimised ones: Fix imports for all filters and scorers. * fix: awaiting on filters that are async in nature eg: content relevance and seo filters * fix: https://github.com/unclecode/crawl4ai/issues/592 * fix: https://github.com/unclecode/crawl4ai/issues/715 --------- Co-authored-by: DarshanTank <darshan.tank@gnani.ai> Co-authored-by: Tuhin Mallick <tuhin.mllk@gmail.com> Co-authored-by: Serhat Soydan <ssoydan@gmail.com> Co-authored-by: cardit1 <maneesh@cardit.in> Co-authored-by: Tautik Agrahari <tautikagrahari@gmail.com>
This commit is contained in:
46
tests/20241401/test_advanced_deep_crawl.py
Normal file
46
tests/20241401/test_advanced_deep_crawl.py
Normal file
@@ -0,0 +1,46 @@
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
|
||||
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
|
||||
from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter, DomainFilter, ContentTypeFilter, ContentRelevanceFilter
|
||||
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
|
||||
# from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
|
||||
|
||||
|
||||
async def main():
    """Example deep crawl of a news site.

    Builds a filter chain and a best-first deep-crawl configuration, then
    streams results from techcrunch.com, printing each crawled URL together
    with its crawl depth and the total elapsed time.
    """
    # Only techcrunch.com pages matching *2025*, relevant to the query, and
    # served as HTML/JS survive the chain.
    url_filters = FilterChain([
        URLPatternFilter(patterns=["*2025*"]),
        DomainFilter(allowed_domains=["techcrunch.com"]),
        ContentRelevanceFilter(query="Use of artificial intelligence in Defence applications", threshold=1),
        ContentTypeFilter(allowed_types=["text/html", "application/javascript"]),
    ])

    run_config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=2,
            include_external=False,
            filter_chain=url_filters,
            url_scorer=KeywordRelevanceScorer(keywords=["anduril", "defence", "AI"]),
        ),
        stream=False,  # flipped to True below, just before the streaming run
        verbose=True,
        cache_mode=CacheMode.BYPASS,
        scraping_strategy=LXMLWebScrapingStrategy(),
    )

    async with AsyncWebCrawler() as crawler:
        print("Starting deep crawl in streaming mode:")
        run_config.stream = True
        started = time.perf_counter()
        async for result in await crawler.arun(
            url="https://techcrunch.com",
            config=run_config,
        ):
            print(f"→ {result.url} (Depth: {result.metadata.get('depth', 0)})")
        print(f"Duration: {time.perf_counter() - started:.2f} seconds")


if __name__ == "__main__":
    asyncio.run(main())
279
tests/20241401/test_deep_crawl_filters.py
Normal file
279
tests/20241401/test_deep_crawl_filters.py
Normal file
@@ -0,0 +1,279 @@
|
||||
from crawl4ai.deep_crawling.filters import ContentRelevanceFilter, URLPatternFilter, DomainFilter, ContentTypeFilter, SEOFilter
|
||||
async def test_pattern_filter():
    """Accuracy checks for URLPatternFilter across suffix, path, glob and regex patterns."""
    # (pattern(s), {url: expected verdict}) pairs — a list (not a dict) so a
    # pattern group can be a list of patterns and shapes may repeat.
    pattern_cases = [
        # Simple suffix patterns (*.html)
        ("*.html", {
            "https://example.com/page.html": True,
            "https://example.com/path/doc.html": True,
            "https://example.com/page.htm": False,
            "https://example.com/page.html?param=1": True,
        }),

        # Path prefix patterns (/foo/*)
        ("*/article/*", {
            "https://example.com/article/123": True,
            "https://example.com/blog/article/456": True,
            "https://example.com/articles/789": False,
            "https://example.com/article": False,
        }),

        # Complex patterns
        ("blog-*-[0-9]", {
            "https://example.com/blog-post-1": True,
            "https://example.com/blog-test-9": True,
            "https://example.com/blog-post": False,
            "https://example.com/blog-post-x": False,
        }),

        # Multiple patterns case
        (["*.pdf", "*/download/*"], {
            "https://example.com/doc.pdf": True,
            "https://example.com/download/file.txt": True,
            "https://example.com/path/download/doc": True,
            "https://example.com/uploads/file.txt": False,
        }),

        # Edge cases
        ("*", {
            "https://example.com": True,
            "": True,
            "http://test.com/path": True,
        }),

        # Complex regex
        (r"^https?://.*\.example\.com/\d+", {
            "https://sub.example.com/123": True,
            "http://test.example.com/456": True,
            "https://example.com/789": False,
            "https://sub.example.com/abc": False,
        }),
    ]

    def check_accuracy():
        """Build one URLPatternFilter per pattern group and report each URL verdict."""
        print("\nAccuracy Tests:")
        print("-" * 50)

        ok = True
        for patterns, expectations in pattern_cases:
            url_filter = URLPatternFilter(patterns)

            for url, expected in expectations.items():
                verdict = url_filter.apply(url)
                if verdict == expected:
                    print(f"✅ Passed: Pattern '{patterns}' with URL '{url}'")
                else:
                    print(f"❌ Failed: Pattern '{patterns}' with URL '{url}'")
                    print(f"   Expected: {expected}, Got: {verdict}")
                    ok = False

        return ok

    # Run tests
    print("Running Pattern Filter Tests...")
    if check_accuracy():
        print("\n✨ All accuracy tests passed!")
    else:
        print("\n❌ Some accuracy tests failed!")
async def test_domain_filter():
    """Accuracy tests for DomainFilter: allow-lists, block-lists, and their combination.

    Declared ``async`` so it can be driven with ``asyncio.run`` alongside the
    filters whose ``apply`` must genuinely be awaited; ``DomainFilter.apply``
    itself is called synchronously here.
    """
    # Fix: removed an unused `from itertools import chain` import.

    # (constructor params, {url: expected verdict}) pairs.
    test_cases = [
        # Allowed domains
        ({"allowed": "example.com"}, {
            "https://example.com/page": True,
            "http://example.com": True,
            "https://sub.example.com": False,
            "https://other.com": False,
        }),

        ({"allowed": ["example.com", "test.com"]}, {
            "https://example.com/page": True,
            "https://test.com/home": True,
            "https://other.com": False,
        }),

        # Blocked domains
        ({"blocked": "malicious.com"}, {
            "https://malicious.com": False,
            "https://safe.com": True,
            "http://malicious.com/login": False,
        }),

        ({"blocked": ["spam.com", "ads.com"]}, {
            "https://spam.com": False,
            "https://ads.com/banner": False,
            "https://example.com": True,
        }),

        # Allowed and Blocked combination
        ({"allowed": "example.com", "blocked": "sub.example.com"}, {
            "https://example.com": True,
            "https://sub.example.com": False,
            "https://other.com": False,
        }),
    ]

    def run_accuracy_test():
        """Instantiate a DomainFilter per parameter set and report per-URL pass/fail."""
        print("\nAccuracy Tests:")
        print("-" * 50)

        all_passed = True
        for params, test_urls in test_cases:
            filter_obj = DomainFilter(
                allowed_domains=params.get("allowed"),
                blocked_domains=params.get("blocked"),
            )

            for url, expected in test_urls.items():
                result = filter_obj.apply(url)
                if result != expected:
                    print(f"\u274C Failed: Params {params} with URL '{url}'")
                    print(f"   Expected: {expected}, Got: {result}")
                    all_passed = False
                else:
                    print(f"\u2705 Passed: Params {params} with URL '{url}'")

        return all_passed

    # Run tests
    print("Running Domain Filter Tests...")
    accuracy_passed = run_accuracy_test()

    if accuracy_passed:
        print("\n\u2728 All accuracy tests passed!")
    else:
        print("\n\u274C Some accuracy tests failed!")
async def test_content_relevance_filter():
    """Verify ContentRelevanceFilter keeps pages relevant to the query and drops the rest.

    NOTE(review): this fetches live Wikipedia pages, so it needs network access.
    """
    relevance_filter = ContentRelevanceFilter(
        query="What was the cause of american civil war?",
        threshold=1,
    )

    # url -> expected verdict
    expectations = {
        "https://en.wikipedia.org/wiki/Cricket": False,
        "https://en.wikipedia.org/wiki/American_Civil_War": True,
    }

    print("\nRunning Content Relevance Filter Tests...")
    print("-" * 50)

    failures = 0
    for url, expected in expectations.items():
        # ContentRelevanceFilter.apply is async and must be awaited.
        verdict = await relevance_filter.apply(url)
        if verdict == expected:
            print(f"\u2705 Passed: URL '{url}'")
        else:
            print(f"\u274C Failed: URL '{url}'")
            print(f"   Expected: {expected}, Got: {verdict}")
            failures += 1

    if failures == 0:
        print("\n\u2728 All content relevance tests passed!")
    else:
        print("\n\u274C Some content relevance tests failed!")
async def test_content_type_filter():
    """Accuracy tests for ContentTypeFilter over single/multiple MIME types plus edge cases.

    Declared ``async`` so it can be driven with ``asyncio.run`` alongside the
    filters whose ``apply`` must genuinely be awaited; ``ContentTypeFilter.apply``
    itself is called synchronously here.
    """
    # Fix: removed an unused `from itertools import chain` import.

    # (constructor params, {url: expected verdict}) pairs.
    test_cases = [
        # Allowed single type
        ({"allowed": "image/png"}, {
            "https://example.com/image.png": True,
            "https://example.com/photo.jpg": False,
            "https://example.com/document.pdf": False,
        }),

        # Multiple allowed types
        ({"allowed": ["image/jpeg", "application/pdf"]}, {
            "https://example.com/photo.jpg": True,
            "https://example.com/document.pdf": True,
            "https://example.com/script.js": False,
        }),

        # No extension should be allowed
        ({"allowed": "application/json"}, {
            "https://example.com/api/data": True,
            "https://example.com/data.json": True,
            "https://example.com/page.html": False,
        }),

        # Unknown extensions should not be allowed
        ({"allowed": "application/octet-stream"}, {
            "https://example.com/file.unknown": True,
            "https://example.com/archive.zip": False,
            "https://example.com/software.exe": False,
        }),
    ]

    def run_accuracy_test():
        """Instantiate a ContentTypeFilter per parameter set and report per-URL pass/fail."""
        print("\nAccuracy Tests:")
        print("-" * 50)

        all_passed = True
        for params, test_urls in test_cases:
            filter_obj = ContentTypeFilter(
                allowed_types=params.get("allowed"),
            )

            for url, expected in test_urls.items():
                result = filter_obj.apply(url)
                if result != expected:
                    print(f"\u274C Failed: Params {params} with URL '{url}'")
                    print(f"   Expected: {expected}, Got: {result}")
                    all_passed = False
                else:
                    print(f"\u2705 Passed: Params {params} with URL '{url}'")

        return all_passed

    # Run tests
    print("Running Content Type Filter Tests...")
    accuracy_passed = run_accuracy_test()

    if accuracy_passed:
        print("\n\u2728 All accuracy tests passed!")
    else:
        print("\n\u274C Some accuracy tests failed!")
async def test_seo_filter():
    """Verify SEOFilter accepts keyword-relevant pages at a 0.5 threshold and rejects others.

    NOTE(review): this fetches live Wikipedia pages, so it needs network access.
    """
    seo_filter = SEOFilter(threshold=0.5, keywords=["SEO", "search engines", "Optimization"])

    # url -> expected verdict
    expectations = {
        "https://en.wikipedia.org/wiki/Search_engine_optimization": True,
        "https://en.wikipedia.org/wiki/Randomness": False,
    }

    print("\nRunning SEO Filter Tests...")
    print("-" * 50)

    failures = 0
    for url, expected in expectations.items():
        # SEOFilter.apply is async and must be awaited.
        verdict = await seo_filter.apply(url)
        if verdict == expected:
            print(f"\u2705 Passed: URL '{url}'")
        else:
            print(f"\u274C Failed: URL '{url}'")
            print(f"   Expected: {expected}, Got: {verdict}")
            failures += 1

    if failures == 0:
        print("\n\u2728 All SEO filter tests passed!")
    else:
        print("\n\u274C Some SEO filter tests failed!")
import asyncio


if __name__ == "__main__":
    # Every test above is a coroutine function; run them sequentially,
    # preserving the original execution order.
    for filter_test in (
        test_pattern_filter,
        test_domain_filter,
        test_content_type_filter,
        test_content_relevance_filter,
        test_seo_filter,
    ):
        asyncio.run(filter_test())
179
tests/20241401/test_deep_crawl_scorers.py
Normal file
179
tests/20241401/test_deep_crawl_scorers.py
Normal file
@@ -0,0 +1,179 @@
|
||||
from crawl4ai.deep_crawling.scorers import CompositeScorer, ContentTypeScorer, DomainAuthorityScorer, FreshnessScorer, KeywordRelevanceScorer, PathDepthScorer
|
||||
|
||||
|
||||
def test_scorers():
    """Accuracy tests for every URL scorer, individually and combined via CompositeScorer.

    Each test case bundles a scorer tag, its constructor config, and a map of
    URL -> expected (already weighted) score. Expected values are coupled to
    the scorer implementations — update them together with the scorers.
    """
    test_cases = [
        # Keyword Scorer Tests
        {
            "scorer_type": "keyword",
            "config": {
                "keywords": ["python", "blog"],
                "weight": 1.0,
                "case_sensitive": False
            },
            "urls": {
                # 0.5 here corresponds to one of the two keywords matching.
                "https://example.com/python-blog": 1.0,
                "https://example.com/PYTHON-BLOG": 1.0,
                "https://example.com/python-only": 0.5,
                "https://example.com/other": 0.0
            }
        },

        # Path Depth Scorer Tests
        {
            "scorer_type": "path_depth",
            "config": {
                "optimal_depth": 2,
                "weight": 1.0
            },
            "urls": {
                # Score peaks at the optimal depth and decays away from it.
                "https://example.com/a/b": 1.0,
                "https://example.com/a": 0.5,
                "https://example.com/a/b/c": 0.5,
                "https://example.com": 0.33333333
            }
        },

        # Content Type Scorer Tests
        {
            "scorer_type": "content_type",
            "config": {
                "type_weights": {
                    ".html$": 1.0,
                    ".pdf$": 0.8,
                    ".jpg$": 0.6
                },
                "weight": 1.0
            },
            "urls": {
                # Unlisted extensions score 0.0.
                "https://example.com/doc.html": 1.0,
                "https://example.com/doc.pdf": 0.8,
                "https://example.com/img.jpg": 0.6,
                "https://example.com/other.txt": 0.0
            }
        },

        # Freshness Scorer Tests
        {
            "scorer_type": "freshness",
            "config": {
                "weight": 1.0,  # Remove current_year since original doesn't support it
            },
            "urls": {
                # Newer years in the URL score higher; no date falls back to 0.5.
                "https://example.com/2024/01/post": 1.0,
                "https://example.com/2023/12/post": 0.9,
                "https://example.com/2022/post": 0.8,
                "https://example.com/no-date": 0.5
            }
        },

        # Domain Authority Scorer Tests
        {
            "scorer_type": "domain",
            "config": {
                "domain_weights": {
                    "python.org": 1.0,
                    "github.com": 0.8,
                    "medium.com": 0.6
                },
                "default_weight": 0.3,
                "weight": 1.0
            },
            "urls": {
                # Unlisted domains get default_weight.
                "https://python.org/about": 1.0,
                "https://github.com/repo": 0.8,
                "https://medium.com/post": 0.6,
                "https://unknown.com": 0.3
            }
        }
    ]

    def create_scorer(scorer_type, config):
        """Factory mapping a test-case tag to the concrete scorer instance."""
        if scorer_type == "keyword":
            return KeywordRelevanceScorer(**config)
        elif scorer_type == "path_depth":
            return PathDepthScorer(**config)
        elif scorer_type == "content_type":
            return ContentTypeScorer(**config)
        elif scorer_type == "freshness":
            # current_year pinned so the date-based expectations stay stable over time.
            return FreshnessScorer(**config, current_year=2024)
        elif scorer_type == "domain":
            return DomainAuthorityScorer(**config)

    def run_accuracy_test():
        """Score every URL with its scorer and compare against expectations."""
        print("\nAccuracy Tests:")
        print("-" * 50)

        all_passed = True
        for test_case in test_cases:
            print(f"\nTesting {test_case['scorer_type']} scorer:")
            scorer = create_scorer(
                test_case['scorer_type'],
                test_case['config']
            )

            for url, expected in test_case['urls'].items():
                score = round(scorer.score(url), 8)
                expected = round(expected, 8)

                # Tolerance comparison to absorb floating-point noise.
                if abs(score - expected) > 0.00001:
                    print(f"❌ Scorer Failed: URL '{url}'")
                    print(f"   Expected: {expected}, Got: {score}")
                    all_passed = False
                else:
                    print(f"✅ Scorer Passed: URL '{url}'")

        return all_passed

    def run_composite_test():
        """Combine one scorer of each type into a normalized CompositeScorer and check its scores."""
        print("\nTesting Composite Scorer:")
        print("-" * 50)

        # Create test data
        test_urls = {
            "https://python.org/blog/2024/01/new-release.html": 0.86666667,
            "https://github.com/repo/old-code.pdf": 0.62,
            "https://unknown.com/random": 0.26
        }

        # Create composite scorers with all types
        scorers = []

        for test_case in test_cases:
            scorer = create_scorer(
                test_case['scorer_type'],
                test_case['config']
            )
            scorers.append(scorer)

        composite = CompositeScorer(scorers, normalize=True)

        all_passed = True
        for url, expected in test_urls.items():
            score = round(composite.score(url), 8)

            if abs(score - expected) > 0.00001:
                print(f"❌ Composite Failed: URL '{url}'")
                print(f"   Expected: {expected}, Got: {score}")
                all_passed = False
            else:
                print(f"✅ Composite Passed: URL '{url}'")

        return all_passed

    # Run tests
    print("Running Scorer Tests...")
    accuracy_passed = run_accuracy_test()
    composite_passed = run_composite_test()

    if accuracy_passed and composite_passed:
        print("\n✨ All tests passed!")
        # Note: Already have performance tests in run_scorer_performance_test()
    else:
        print("\n❌ Some tests failed!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # test_scorers is a plain (non-async) function, so no event loop is needed.
    test_scorers()
Reference in New Issue
Block a user