feat(scraper): Enhance URL filtering and scoring systems

Implement comprehensive URL filtering and scoring capabilities:

Filters:
- Add URLPatternFilter with glob/regex support
- Implement ContentTypeFilter with MIME type checking
- Add DomainFilter for domain control
- Create FilterChain with stats tracking

Scorers:
- Complete KeywordRelevanceScorer implementation
- Add PathDepthScorer for URL structure scoring
- Implement ContentTypeScorer for file type priorities
- Add FreshnessScorer for date-based scoring
- Add DomainAuthorityScorer for domain weighting
- Create CompositeScorer for combined strategies

Features:
- Add statistics tracking for both filters and scorers
- Implement logging support throughout
- Add resource cleanup methods
- Create comprehensive documentation
- Include performance optimizations

Tests and docs included.
Note: Review URL normalization overlap with recent crawler changes.
UncleCode
2024-11-08 19:02:28 +08:00
parent bae4665949
commit 0d357ab7d2
10 changed files with 186 additions and 281 deletions

View File

@@ -1,2 +1,3 @@
from .async_web_scraper import AsyncWebScraper
from .bfs_scraper_strategy import BFSScraperStrategy
from .filters import URLFilter, FilterChain, URLPatternFilter, ContentTypeFilter

View File

@@ -1,43 +0,0 @@
from .url_filter import URLFilter
from typing import List, Union
from urllib.parse import urlparse
import mimetypes

class ContentTypeFilter(URLFilter):
    """Filter URLs based on expected content type"""

    def __init__(self, allowed_types: Union[str, List[str]],
                 check_extension: bool = True):
        super().__init__()
        self.allowed_types = [allowed_types] if isinstance(allowed_types, str) else allowed_types
        self.check_extension = check_extension
        self._normalize_types()

    def _normalize_types(self):
        """Normalize content type strings"""
        self.allowed_types = [t.lower() for t in self.allowed_types]

    def _check_extension(self, url: str) -> bool:
        """Check URL's file extension"""
        path = urlparse(url).path
        ext = path.split('.')[-1].lower() if '.' in path else ''
        if not ext:
            return True  # No extension, might be dynamic content
        guessed_type = mimetypes.guess_type(url)[0]
        return any(allowed in (guessed_type or '').lower() for allowed in self.allowed_types)

    def apply(self, url: str) -> bool:
        """Check if URL's content type is allowed"""
        result = True
        if self.check_extension:
            result = self._check_extension(url)
        self._update_stats(result)
        return result

# class ContentTypeFilter(URLFilter):
#     def __init__(self, contentType: str):
#         self.contentType = contentType
#     def apply(self, url: str) -> bool:
#         # TODO: This is a stub. Will implement this later
#         return True
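A quick sanity check of the extension logic above — a minimal sketch, assuming the consolidated crawl4ai.scraper exports shown in the __init__.py hunk at the top:

from crawl4ai.scraper import ContentTypeFilter

html_only = ContentTypeFilter("text/html")
print(html_only.apply("https://example.com/post.html"))  # True: mimetypes guesses text/html
print(html_only.apply("https://example.com/logo.png"))   # False: image/png is not allowed
print(html_only.apply("https://example.com/about"))      # True: no extension, assumed dynamic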

View File

@@ -1,72 +0,0 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
import logging
from typing import List

@dataclass
class FilterStats:
    """Statistics for filter applications"""
    total_urls: int = 0
    rejected_urls: int = 0
    passed_urls: int = 0

class URLFilter(ABC):
    """Base class for URL filters"""

    def __init__(self, name: str = None):
        self.name = name or self.__class__.__name__
        self.stats = FilterStats()
        self.logger = logging.getLogger(f"urlfilter.{self.name}")

    @abstractmethod
    def apply(self, url: str) -> bool:
        """Apply the filter to a URL"""
        pass

    def _update_stats(self, passed: bool):
        """Update filter statistics"""
        self.stats.total_urls += 1
        if passed:
            self.stats.passed_urls += 1
        else:
            self.stats.rejected_urls += 1

class FilterChain:
    """Chain of URL filters"""

    def __init__(self, filters: List[URLFilter] = None):
        self.filters = filters or []
        self.stats = FilterStats()
        self.logger = logging.getLogger("urlfilter.chain")

    def add_filter(self, filter_: URLFilter) -> 'FilterChain':
        """Add a filter to the chain"""
        self.filters.append(filter_)
        return self  # Enable method chaining

    def apply(self, url: str) -> bool:
        """Apply all filters in the chain"""
        self.stats.total_urls += 1
        for filter_ in self.filters:
            if not filter_.apply(url):
                self.stats.rejected_urls += 1
                self.logger.debug(f"URL {url} rejected by {filter_.name}")
                return False
        self.stats.passed_urls += 1
        return True

# class URLFilter(ABC):
#     @abstractmethod
#     def apply(self, url: str) -> bool:
#         pass

# class FilterChain:
#     def __init__(self):
#         self.filters = []
#     def add_filter(self, filter: URLFilter):
#         self.filters.append(filter)
#     def apply(self, url: str) -> bool:
#         return all(filter.apply(url) for filter in self.filters)
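A minimal sketch of how the chain composes, assuming the package exports shown in the __init__.py hunk above:

from crawl4ai.scraper import FilterChain, URLPatternFilter, ContentTypeFilter

# add_filter returns self, so filters can be attached fluently
chain = (FilterChain()
         .add_filter(URLPatternFilter("*/blog/*"))
         .add_filter(ContentTypeFilter("text/html")))

print(chain.apply("https://example.com/blog/post.html"))  # True
print(chain.apply("https://example.com/shop/item.png"))   # False: rejected by the pattern filter

# The chain and each individual filter track their own FilterStats
print(chain.stats.passed_urls, chain.stats.rejected_urls)  # 1 1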

View File

@@ -1,39 +0,0 @@
from .url_filter import URLFilter
from re import Pattern
from typing import List, Union
import re
import fnmatch

class URLPatternFilter(URLFilter):
    """Filter URLs based on glob patterns or regex"""

    def __init__(self, patterns: Union[str, Pattern, List[Union[str, Pattern]]],
                 use_glob: bool = True):
        super().__init__()
        self.patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
        self.use_glob = use_glob
        self._compiled_patterns = []
        for pattern in self.patterns:
            if isinstance(pattern, str) and use_glob:
                self._compiled_patterns.append(self._glob_to_regex(pattern))
            else:
                self._compiled_patterns.append(re.compile(pattern) if isinstance(pattern, str) else pattern)

    def _glob_to_regex(self, pattern: str) -> Pattern:
        """Convert glob pattern to regex"""
        return re.compile(fnmatch.translate(pattern))

    def apply(self, url: str) -> bool:
        """Check if URL matches any of the patterns"""
        matches = any(pattern.search(url) for pattern in self._compiled_patterns)
        self._update_stats(matches)
        return matches

# class URLPatternFilter(URLFilter):
#     def __init__(self, pattern: Pattern):
#         self.pattern = pattern
#     def apply(self, url: str) -> bool:
#         # TODO: This is a stub. Will implement this later.
#         return True
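A short sketch of the two pattern styles accepted above: glob strings are translated via fnmatch.translate, while pre-compiled regexes pass through unchanged.

import re
from crawl4ai.scraper import URLPatternFilter

glob_filter = URLPatternFilter("*/news/*")                    # glob string
regex_filter = URLPatternFilter(re.compile(r"\d{4}/\d{2}/"))  # compiled regex

print(glob_filter.apply("https://example.com/news/today"))      # True
print(regex_filter.apply("https://example.com/2024/11/story"))  # True
print(glob_filter.apply("https://example.com/about"))           # False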

View File

@@ -1,9 +0,0 @@
from .url_scorer import URLScorer
from typing import List

class KeywordRelevanceScorer(URLScorer):
    def __init__(self, keywords: List[str]):
        self.keywords = keywords

    def score(self, url: str) -> float:
        # TODO: This is a stub. Will implement this later.
        return 1.0
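The commit message says this stub is now complete. A hedged sketch of what the finished scorer could look like — the `weight` argument matches how the example file later in this commit constructs it, but the scoring formula here is an illustrative assumption, not necessarily what shipped:

from typing import List
from .url_scorer import URLScorer

class KeywordRelevanceScorer(URLScorer):
    """Scores a URL by the fraction of keywords it contains (illustrative)."""
    def __init__(self, keywords: List[str], weight: float = 1.0):
        self.keywords = [k.lower() for k in keywords]  # assumed case-insensitive matching
        self.weight = weight

    def score(self, url: str) -> float:
        if not self.keywords:
            return 0.0
        url_lower = url.lower()
        # Assumed rule: weight scaled by the share of keywords found in the URL
        hits = sum(1 for k in self.keywords if k in url_lower)
        return self.weight * hits / len(self.keywords)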

View File

@@ -1,6 +0,0 @@
from abc import ABC, abstractmethod

class URLScorer(ABC):
    @abstractmethod
    def score(self, url: str) -> float:
        pass
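The CompositeScorer from the commit message builds on this interface. A hedged sketch of one plausible shape — taking a list of scorers matches the example file below, but the sum-of-sub-scores combination rule is an assumption:

from typing import List
from .url_scorer import URLScorer

class CompositeScorer(URLScorer):
    """Combines several scorers into one (illustrative sketch)."""
    def __init__(self, scorers: List[URLScorer]):
        self.scorers = scorers

    def score(self, url: str) -> float:
        # Assumed combination: simple sum; each sub-scorer is expected
        # to apply its own weight internally
        return sum(scorer.score(url) for scorer in self.scorers)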

View File

@@ -0,0 +1,184 @@
# basic_scraper_example.py
from crawl4ai.scraper import (
    AsyncWebScraper,
    BFSScraperStrategy,
    FilterChain,
    URLPatternFilter,
    ContentTypeFilter
)
from crawl4ai.async_webcrawler import AsyncWebCrawler

async def basic_scraper_example():
    """
    Basic example: Scrape a blog site for articles
    - Crawls only HTML pages
    - Stays within the blog section
    - Collects all results at once
    """
    # Create a simple filter chain
    filter_chain = FilterChain([
        # Only crawl pages within the blog section
        URLPatternFilter("*/blog/*"),
        # Only process HTML pages
        ContentTypeFilter(["text/html"])
    ])

    # Initialize the strategy with basic configuration
    strategy = BFSScraperStrategy(
        max_depth=2,          # Only go 2 levels deep
        filter_chain=filter_chain,
        url_scorer=None,      # Use default scoring
        max_concurrent=3      # Limit concurrent requests
    )

    # Create the crawler and scraper
    crawler = AsyncWebCrawler()
    scraper = AsyncWebScraper(crawler, strategy)

    # Start scraping
    try:
        result = await scraper.ascrape("https://example.com/blog/")
        # Process results
        print(f"Crawled {len(result.crawled_urls)} pages:")
        for url, data in result.extracted_data.items():
            print(f"- {url}: {len(data.html)} bytes")
    except Exception as e:
        print(f"Error during scraping: {e}")

# advanced_scraper_example.py
import logging
import re
from crawl4ai.scraper import (
    AsyncWebScraper,
    BFSScraperStrategy,
    FilterChain,
    URLPatternFilter,
    ContentTypeFilter,
    DomainFilter,
    KeywordRelevanceScorer,
    PathDepthScorer,
    FreshnessScorer,
    CompositeScorer
)
from crawl4ai.async_webcrawler import AsyncWebCrawler

async def advanced_scraper_example():
    """
    Advanced example: Intelligent news site scraping
    - Uses all filter types
    - Implements sophisticated scoring
    - Streams results
    - Includes monitoring and logging
    """
    # Set up logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("advanced_scraper")

    # Create sophisticated filter chain
    filter_chain = FilterChain([
        # Domain control
        DomainFilter(
            allowed_domains=["example.com", "blog.example.com"],
            blocked_domains=["ads.example.com", "tracker.example.com"]
        ),
        # URL patterns
        URLPatternFilter([
            "*/article/*",
            "*/news/*",
            "*/blog/*",
            re.compile(r"\d{4}/\d{2}/.*")  # Date-based URLs
        ]),
        # Content types
        ContentTypeFilter([
            "text/html",
            "application/xhtml+xml"
        ])
    ])

    # Create composite scorer
    scorer = CompositeScorer([
        # Prioritize by keywords
        KeywordRelevanceScorer(
            keywords=["news", "breaking", "update", "latest"],
            weight=1.0
        ),
        # Prefer optimal URL structure
        PathDepthScorer(
            optimal_depth=3,
            weight=0.7
        ),
        # Prioritize fresh content
        FreshnessScorer(weight=0.9)
    ])

    # Initialize strategy with advanced configuration
    strategy = BFSScraperStrategy(
        max_depth=4,
        filter_chain=filter_chain,
        url_scorer=scorer,
        max_concurrent=5,
        min_crawl_delay=1
    )

    # Create crawler and scraper
    crawler = AsyncWebCrawler()
    scraper = AsyncWebScraper(crawler, strategy)

    # Track statistics
    stats = {
        'processed': 0,
        'errors': 0,
        'total_size': 0
    }

    try:
        # Use streaming mode
        async for result in scraper.ascrape("https://example.com/news/", stream=True):
            stats['processed'] += 1
            if result.success:
                stats['total_size'] += len(result.html)
                logger.info(f"Processed: {result.url}")
                # Log scoring information
                for scorer_name, score in result.scores.items():
                    logger.debug(f"{scorer_name}: {score:.2f}")
            else:
                stats['errors'] += 1
                logger.error(f"Failed to process {result.url}: {result.error_message}")
            # Log progress regularly
            if stats['processed'] % 10 == 0:
                logger.info(f"Progress: {stats['processed']} URLs processed")
    except Exception as e:
        logger.error(f"Scraping error: {e}")
    finally:
        # Print final statistics
        logger.info("Scraping completed:")
        logger.info(f"- URLs processed: {stats['processed']}")
        logger.info(f"- Errors: {stats['errors']}")
        logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")

        # Print filter statistics
        for filter_ in filter_chain.filters:
            logger.info(f"{filter_.name} stats:")
            logger.info(f"- Passed: {filter_.stats.passed_urls}")
            logger.info(f"- Rejected: {filter_.stats.rejected_urls}")

        # Print scorer statistics
        logger.info("Scoring statistics:")
        logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
        logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}")

if __name__ == "__main__":
    import asyncio

    # Run basic example
    print("Running basic scraper example...")
    asyncio.run(basic_scraper_example())

    print("\nRunning advanced scraper example...")
    asyncio.run(advanced_scraper_example())

View File

@@ -1,111 +0,0 @@
import unittest, os
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import RegexChunking, FixedLengthWordChunking, SlidingWindowChunking
from crawl4ai.extraction_strategy import CosineStrategy, LLMExtractionStrategy, TopicExtractionStrategy, NoExtractionStrategy

class TestWebCrawler(unittest.TestCase):
    def setUp(self):
        self.crawler = WebCrawler()

    def test_warmup(self):
        self.crawler.warmup()
        self.assertTrue(self.crawler.ready, "WebCrawler failed to warm up")

    def test_run_default_strategies(self):
        result = self.crawler.run(
            url='https://www.nbcnews.com/business',
            word_count_threshold=5,
            chunking_strategy=RegexChunking(),
            extraction_strategy=CosineStrategy(),
            bypass_cache=True
        )
        self.assertTrue(result.success, "Failed to crawl and extract using default strategies")

    def test_run_different_strategies(self):
        url = 'https://www.nbcnews.com/business'

        # Test with FixedLengthWordChunking and LLMExtractionStrategy
        result = self.crawler.run(
            url=url,
            word_count_threshold=5,
            chunking_strategy=FixedLengthWordChunking(chunk_size=100),
            extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-3.5-turbo", api_token=os.getenv('OPENAI_API_KEY')),
            bypass_cache=True
        )
        self.assertTrue(result.success, "Failed to crawl and extract with FixedLengthWordChunking and LLMExtractionStrategy")

        # Test with SlidingWindowChunking and TopicExtractionStrategy
        result = self.crawler.run(
            url=url,
            word_count_threshold=5,
            chunking_strategy=SlidingWindowChunking(window_size=100, step=50),
            extraction_strategy=TopicExtractionStrategy(num_keywords=5),
            bypass_cache=True
        )
        self.assertTrue(result.success, "Failed to crawl and extract with SlidingWindowChunking and TopicExtractionStrategy")

    def test_invalid_url(self):
        with self.assertRaises(Exception) as context:
            self.crawler.run(url='invalid_url', bypass_cache=True)
        self.assertIn("Invalid URL", str(context.exception))

    def test_unsupported_extraction_strategy(self):
        with self.assertRaises(Exception) as context:
            self.crawler.run(url='https://www.nbcnews.com/business', extraction_strategy="UnsupportedStrategy", bypass_cache=True)
        self.assertIn("Unsupported extraction strategy", str(context.exception))

    def test_invalid_css_selector(self):
        with self.assertRaises(ValueError) as context:
            self.crawler.run(url='https://www.nbcnews.com/business', css_selector="invalid_selector", bypass_cache=True)
        self.assertIn("Invalid CSS selector", str(context.exception))

    def test_crawl_with_cache_and_bypass_cache(self):
        url = 'https://www.nbcnews.com/business'
        # First crawl with cache enabled
        result = self.crawler.run(url=url, bypass_cache=False)
        self.assertTrue(result.success, "Failed to crawl and cache the result")
        # Second crawl with bypass_cache=True
        result = self.crawler.run(url=url, bypass_cache=True)
        self.assertTrue(result.success, "Failed to bypass cache and fetch fresh data")

    def test_fetch_multiple_pages(self):
        urls = [
            'https://www.nbcnews.com/business',
            'https://www.bbc.com/news'
        ]
        results = []
        for url in urls:
            result = self.crawler.run(
                url=url,
                word_count_threshold=5,
                chunking_strategy=RegexChunking(),
                extraction_strategy=CosineStrategy(),
                bypass_cache=True
            )
            results.append(result)
        self.assertEqual(len(results), 2, "Failed to crawl and extract multiple pages")
        for result in results:
            self.assertTrue(result.success, "Failed to crawl and extract a page in the list")

    def test_run_fixed_length_word_chunking_and_no_extraction(self):
        result = self.crawler.run(
            url='https://www.nbcnews.com/business',
            word_count_threshold=5,
            chunking_strategy=FixedLengthWordChunking(chunk_size=100),
            extraction_strategy=NoExtractionStrategy(),
            bypass_cache=True
        )
        self.assertTrue(result.success, "Failed to crawl and extract with FixedLengthWordChunking and NoExtractionStrategy")

    def test_run_sliding_window_and_no_extraction(self):
        result = self.crawler.run(
            url='https://www.nbcnews.com/business',
            word_count_threshold=5,
            chunking_strategy=SlidingWindowChunking(window_size=100, step=50),
            extraction_strategy=NoExtractionStrategy(),
            bypass_cache=True
        )
        self.assertTrue(result.success, "Failed to crawl and extract with SlidingWindowChunking and NoExtractionStrategy")

if __name__ == '__main__':
    unittest.main()