2025 feb alpha 1 (#685)

* spelling change in prompt

* gpt-4o-mini support

* Remove leading Y before here

* prompt spell correction

* (Docs) Fix numbered list end-of-line formatting

Added the missing "two spaces" to add a line break

* fix: access downloads_path through browser_config in _handle_download method - Fixes #585

* crawl

* fix: https://github.com/unclecode/crawl4ai/issues/592

* fix: https://github.com/unclecode/crawl4ai/issues/583

* Docs update: https://github.com/unclecode/crawl4ai/issues/649

* fix: https://github.com/unclecode/crawl4ai/issues/570

* Docs: updated example for content-selection to reflect new changes in yc newsfeed css

* Refactor: Removed old filters and replaced with optimised filters

* fix:Fixed imports as per the new names of filters

* Tests: For deep crawl filters

* Refactor: Remove old scorers and replace with optimised ones: Fix imports forall filters and scorers.

* fix: awaiting on filters that are async in nature eg: content relevance and seo filters

* fix: https://github.com/unclecode/crawl4ai/issues/592

* fix: https://github.com/unclecode/crawl4ai/issues/715

---------

Co-authored-by: DarshanTank <darshan.tank@gnani.ai>
Co-authored-by: Tuhin Mallick <tuhin.mllk@gmail.com>
Co-authored-by: Serhat Soydan <ssoydan@gmail.com>
Co-authored-by: cardit1 <maneesh@cardit.in>
Co-authored-by: Tautik Agrahari <tautikagrahari@gmail.com>
This commit is contained in:
Aravind
2025-02-19 11:43:17 +05:30
committed by GitHub
parent c171891999
commit dad592c801
19 changed files with 833 additions and 1350 deletions

View File

@@ -0,0 +1,46 @@
import asyncio
import time
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter, DomainFilter, ContentTypeFilter, ContentRelevanceFilter
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
# from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
async def main():
"""Example deep crawl of documentation site."""
filter_chain = FilterChain([
URLPatternFilter(patterns=["*2025*"]),
DomainFilter(allowed_domains=["techcrunch.com"]),
ContentRelevanceFilter(query="Use of artificial intelligence in Defence applications", threshold=1),
ContentTypeFilter(allowed_types=["text/html","application/javascript"])
])
config = CrawlerRunConfig(
deep_crawl_strategy = BestFirstCrawlingStrategy(
max_depth=2,
include_external=False,
filter_chain=filter_chain,
url_scorer=KeywordRelevanceScorer(keywords=["anduril", "defence", "AI"]),
),
stream=False,
verbose=True,
cache_mode=CacheMode.BYPASS,
scraping_strategy=LXMLWebScrapingStrategy()
)
async with AsyncWebCrawler() as crawler:
print("Starting deep crawl in streaming mode:")
config.stream = True
start_time = time.perf_counter()
async for result in await crawler.arun(
url="https://techcrunch.com",
config=config
):
print(f"{result.url} (Depth: {result.metadata.get('depth', 0)})")
print(f"Duration: {time.perf_counter() - start_time:.2f} seconds")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,279 @@
from crawl4ai.deep_crawling.filters import ContentRelevanceFilter, URLPatternFilter, DomainFilter, ContentTypeFilter, SEOFilter
async def test_pattern_filter():
# Test cases as list of tuples instead of dict for multiple patterns
test_cases = [
# Simple suffix patterns (*.html)
("*.html", {
"https://example.com/page.html": True,
"https://example.com/path/doc.html": True,
"https://example.com/page.htm": False,
"https://example.com/page.html?param=1": True,
}),
# Path prefix patterns (/foo/*)
("*/article/*", {
"https://example.com/article/123": True,
"https://example.com/blog/article/456": True,
"https://example.com/articles/789": False,
"https://example.com/article": False,
}),
# Complex patterns
("blog-*-[0-9]", {
"https://example.com/blog-post-1": True,
"https://example.com/blog-test-9": True,
"https://example.com/blog-post": False,
"https://example.com/blog-post-x": False,
}),
# Multiple patterns case
(["*.pdf", "*/download/*"], {
"https://example.com/doc.pdf": True,
"https://example.com/download/file.txt": True,
"https://example.com/path/download/doc": True,
"https://example.com/uploads/file.txt": False,
}),
# Edge cases
("*", {
"https://example.com": True,
"": True,
"http://test.com/path": True,
}),
# Complex regex
(r"^https?://.*\.example\.com/\d+", {
"https://sub.example.com/123": True,
"http://test.example.com/456": True,
"https://example.com/789": False,
"https://sub.example.com/abc": False,
})
]
def run_accuracy_test():
print("\nAccuracy Tests:")
print("-" * 50)
all_passed = True
for patterns, test_urls in test_cases:
filter_obj = URLPatternFilter(patterns)
for url, expected in test_urls.items():
result = filter_obj.apply(url)
if result != expected:
print(f"❌ Failed: Pattern '{patterns}' with URL '{url}'")
print(f" Expected: {expected}, Got: {result}")
all_passed = False
else:
print(f"✅ Passed: Pattern '{patterns}' with URL '{url}'")
return all_passed
# Run tests
print("Running Pattern Filter Tests...")
accuracy_passed = run_accuracy_test()
if accuracy_passed:
print("\n✨ All accuracy tests passed!")
else:
print("\n❌ Some accuracy tests failed!")
async def test_domain_filter():
from itertools import chain
# Test cases
test_cases = [
# Allowed domains
({"allowed": "example.com"}, {
"https://example.com/page": True,
"http://example.com": True,
"https://sub.example.com": False,
"https://other.com": False,
}),
({"allowed": ["example.com", "test.com"]}, {
"https://example.com/page": True,
"https://test.com/home": True,
"https://other.com": False,
}),
# Blocked domains
({"blocked": "malicious.com"}, {
"https://malicious.com": False,
"https://safe.com": True,
"http://malicious.com/login": False,
}),
({"blocked": ["spam.com", "ads.com"]}, {
"https://spam.com": False,
"https://ads.com/banner": False,
"https://example.com": True,
}),
# Allowed and Blocked combination
({"allowed": "example.com", "blocked": "sub.example.com"}, {
"https://example.com": True,
"https://sub.example.com": False,
"https://other.com": False,
}),
]
def run_accuracy_test():
print("\nAccuracy Tests:")
print("-" * 50)
all_passed = True
for params, test_urls in test_cases:
filter_obj = DomainFilter(
allowed_domains=params.get("allowed"),
blocked_domains=params.get("blocked"),
)
for url, expected in test_urls.items():
result = filter_obj.apply(url)
if result != expected:
print(f"\u274C Failed: Params {params} with URL '{url}'")
print(f" Expected: {expected}, Got: {result}")
all_passed = False
else:
print(f"\u2705 Passed: Params {params} with URL '{url}'")
return all_passed
# Run tests
print("Running Domain Filter Tests...")
accuracy_passed = run_accuracy_test()
if accuracy_passed:
print("\n\u2728 All accuracy tests passed!")
else:
print("\n\u274C Some accuracy tests failed!")
async def test_content_relevance_filter():
relevance_filter = ContentRelevanceFilter(
query="What was the cause of american civil war?",
threshold=1
)
test_cases = {
"https://en.wikipedia.org/wiki/Cricket": False,
"https://en.wikipedia.org/wiki/American_Civil_War": True,
}
print("\nRunning Content Relevance Filter Tests...")
print("-" * 50)
all_passed = True
for url, expected in test_cases.items():
result = await relevance_filter.apply(url)
if result != expected:
print(f"\u274C Failed: URL '{url}'")
print(f" Expected: {expected}, Got: {result}")
all_passed = False
else:
print(f"\u2705 Passed: URL '{url}'")
if all_passed:
print("\n\u2728 All content relevance tests passed!")
else:
print("\n\u274C Some content relevance tests failed!")
async def test_content_type_filter():
from itertools import chain
# Test cases
test_cases = [
# Allowed single type
({"allowed": "image/png"}, {
"https://example.com/image.png": True,
"https://example.com/photo.jpg": False,
"https://example.com/document.pdf": False,
}),
# Multiple allowed types
({"allowed": ["image/jpeg", "application/pdf"]}, {
"https://example.com/photo.jpg": True,
"https://example.com/document.pdf": True,
"https://example.com/script.js": False,
}),
# No extension should be allowed
({"allowed": "application/json"}, {
"https://example.com/api/data": True,
"https://example.com/data.json": True,
"https://example.com/page.html": False,
}),
# Unknown extensions should not be allowed
({"allowed": "application/octet-stream"}, {
"https://example.com/file.unknown": True,
"https://example.com/archive.zip": False,
"https://example.com/software.exe": False,
}),
]
def run_accuracy_test():
print("\nAccuracy Tests:")
print("-" * 50)
all_passed = True
for params, test_urls in test_cases:
filter_obj = ContentTypeFilter(
allowed_types=params.get("allowed"),
)
for url, expected in test_urls.items():
result = filter_obj.apply(url)
if result != expected:
print(f"\u274C Failed: Params {params} with URL '{url}'")
print(f" Expected: {expected}, Got: {result}")
all_passed = False
else:
print(f"\u2705 Passed: Params {params} with URL '{url}'")
return all_passed
# Run tests
print("Running Content Type Filter Tests...")
accuracy_passed = run_accuracy_test()
if accuracy_passed:
print("\n\u2728 All accuracy tests passed!")
else:
print("\n\u274C Some accuracy tests failed!")
async def test_seo_filter():
seo_filter = SEOFilter(threshold=0.5, keywords=["SEO", "search engines", "Optimization"])
test_cases = {
"https://en.wikipedia.org/wiki/Search_engine_optimization": True,
"https://en.wikipedia.org/wiki/Randomness": False,
}
print("\nRunning SEO Filter Tests...")
print("-" * 50)
all_passed = True
for url, expected in test_cases.items():
result = await seo_filter.apply(url)
if result != expected:
print(f"\u274C Failed: URL '{url}'")
print(f" Expected: {expected}, Got: {result}")
all_passed = False
else:
print(f"\u2705 Passed: URL '{url}'")
if all_passed:
print("\n\u2728 All SEO filter tests passed!")
else:
print("\n\u274C Some SEO filter tests failed!")
import asyncio
if __name__ == "__main__":
asyncio.run(test_pattern_filter())
asyncio.run(test_domain_filter())
asyncio.run(test_content_type_filter())
asyncio.run(test_content_relevance_filter())
asyncio.run(test_seo_filter())

View File

@@ -0,0 +1,179 @@
from crawl4ai.deep_crawling.scorers import CompositeScorer, ContentTypeScorer, DomainAuthorityScorer, FreshnessScorer, KeywordRelevanceScorer, PathDepthScorer
def test_scorers():
test_cases = [
# Keyword Scorer Tests
{
"scorer_type": "keyword",
"config": {
"keywords": ["python", "blog"],
"weight": 1.0,
"case_sensitive": False
},
"urls": {
"https://example.com/python-blog": 1.0,
"https://example.com/PYTHON-BLOG": 1.0,
"https://example.com/python-only": 0.5,
"https://example.com/other": 0.0
}
},
# Path Depth Scorer Tests
{
"scorer_type": "path_depth",
"config": {
"optimal_depth": 2,
"weight": 1.0
},
"urls": {
"https://example.com/a/b": 1.0,
"https://example.com/a": 0.5,
"https://example.com/a/b/c": 0.5,
"https://example.com": 0.33333333
}
},
# Content Type Scorer Tests
{
"scorer_type": "content_type",
"config": {
"type_weights": {
".html$": 1.0,
".pdf$": 0.8,
".jpg$": 0.6
},
"weight": 1.0
},
"urls": {
"https://example.com/doc.html": 1.0,
"https://example.com/doc.pdf": 0.8,
"https://example.com/img.jpg": 0.6,
"https://example.com/other.txt": 0.0
}
},
# Freshness Scorer Tests
{
"scorer_type": "freshness",
"config": {
"weight": 1.0, # Remove current_year since original doesn't support it
},
"urls": {
"https://example.com/2024/01/post": 1.0,
"https://example.com/2023/12/post": 0.9,
"https://example.com/2022/post": 0.8,
"https://example.com/no-date": 0.5
}
},
# Domain Authority Scorer Tests
{
"scorer_type": "domain",
"config": {
"domain_weights": {
"python.org": 1.0,
"github.com": 0.8,
"medium.com": 0.6
},
"default_weight": 0.3,
"weight": 1.0
},
"urls": {
"https://python.org/about": 1.0,
"https://github.com/repo": 0.8,
"https://medium.com/post": 0.6,
"https://unknown.com": 0.3
}
}
]
def create_scorer(scorer_type, config):
if scorer_type == "keyword":
return KeywordRelevanceScorer(**config)
elif scorer_type == "path_depth":
return PathDepthScorer(**config)
elif scorer_type == "content_type":
return ContentTypeScorer(**config)
elif scorer_type == "freshness":
return FreshnessScorer(**config,current_year=2024)
elif scorer_type == "domain":
return DomainAuthorityScorer(**config)
def run_accuracy_test():
print("\nAccuracy Tests:")
print("-" * 50)
all_passed = True
for test_case in test_cases:
print(f"\nTesting {test_case['scorer_type']} scorer:")
scorer = create_scorer(
test_case['scorer_type'],
test_case['config']
)
for url, expected in test_case['urls'].items():
score = round(scorer.score(url), 8)
expected = round(expected, 8)
if abs(score - expected) > 0.00001:
print(f"❌ Scorer Failed: URL '{url}'")
print(f" Expected: {expected}, Got: {score}")
all_passed = False
else:
print(f"✅ Scorer Passed: URL '{url}'")
return all_passed
def run_composite_test():
print("\nTesting Composite Scorer:")
print("-" * 50)
# Create test data
test_urls = {
"https://python.org/blog/2024/01/new-release.html":0.86666667,
"https://github.com/repo/old-code.pdf": 0.62,
"https://unknown.com/random": 0.26
}
# Create composite scorers with all types
scorers = []
for test_case in test_cases:
scorer = create_scorer(
test_case['scorer_type'],
test_case['config']
)
scorers.append(scorer)
composite = CompositeScorer(scorers, normalize=True)
all_passed = True
for url, expected in test_urls.items():
score = round(composite.score(url), 8)
if abs(score - expected) > 0.00001:
print(f"❌ Composite Failed: URL '{url}'")
print(f" Expected: {expected}, Got: {score}")
all_passed = False
else:
print(f"✅ Composite Passed: URL '{url}'")
return all_passed
# Run tests
print("Running Scorer Tests...")
accuracy_passed = run_accuracy_test()
composite_passed = run_composite_test()
if accuracy_passed and composite_passed:
print("\n✨ All tests passed!")
# Note: Already have performance tests in run_scorer_performance_test()
else:
print("\n❌ Some tests failed!")
if __name__ == "__main__":
test_scorers()