From 0d357ab7d2659596442095f81f36330c9da7627c Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 8 Nov 2024 19:02:28 +0800 Subject: [PATCH] feat(scraper): Enhance URL filtering and scoring systems Implement comprehensive URL filtering and scoring capabilities: Filters: - Add URLPatternFilter with glob/regex support - Implement ContentTypeFilter with MIME type checking - Add DomainFilter for domain control - Create FilterChain with stats tracking Scorers: - Complete KeywordRelevanceScorer implementation - Add PathDepthScorer for URL structure scoring - Implement ContentTypeScorer for file type priorities - Add FreshnessScorer for date-based scoring - Add DomainAuthorityScorer for domain weighting - Create CompositeScorer for combined strategies Features: - Add statistics tracking for both filters and scorers - Implement logging support throughout - Add resource cleanup methods - Create comprehensive documentation - Include performance optimizations Tests and docs included. Note: Review URL normalization overlap with recent crawler changes. --- crawl4ai/scraper/__init__.py | 3 +- .../{filters/__init__.py => filters.py} | 0 .../scraper/filters/content_type_filter.py | 43 ---- crawl4ai/scraper/filters/url_filter.py | 72 ------- .../scraper/filters/url_pattern_filter.py | 39 ---- .../{scorers/__init__.py => scorers.py} | 0 .../scorers/keyword_relevance_scorer.py | 9 - crawl4ai/scraper/scorers/url_scorer.py | 6 - docs/scrapper/scraper_quickstart.py | 184 ++++++++++++++++++ docs/scrapper/web_crawler_quick_start.py | 111 ----------- 10 files changed, 186 insertions(+), 281 deletions(-) rename crawl4ai/scraper/{filters/__init__.py => filters.py} (100%) delete mode 100644 crawl4ai/scraper/filters/content_type_filter.py delete mode 100644 crawl4ai/scraper/filters/url_filter.py delete mode 100644 crawl4ai/scraper/filters/url_pattern_filter.py rename crawl4ai/scraper/{scorers/__init__.py => scorers.py} (100%) delete mode 100644 crawl4ai/scraper/scorers/keyword_relevance_scorer.py delete mode 100644 crawl4ai/scraper/scorers/url_scorer.py create mode 100644 docs/scrapper/scraper_quickstart.py delete mode 100644 docs/scrapper/web_crawler_quick_start.py diff --git a/crawl4ai/scraper/__init__.py b/crawl4ai/scraper/__init__.py index 1997e162..1138a917 100644 --- a/crawl4ai/scraper/__init__.py +++ b/crawl4ai/scraper/__init__.py @@ -1,2 +1,3 @@ from .async_web_scraper import AsyncWebScraper -from .bfs_scraper_strategy import BFSScraperStrategy \ No newline at end of file +from .bfs_scraper_strategy import BFSScraperStrategy +from .filters import URLFilter, FilterChain, URLPatternFilter, ContentTypeFilter \ No newline at end of file diff --git a/crawl4ai/scraper/filters/__init__.py b/crawl4ai/scraper/filters.py similarity index 100% rename from crawl4ai/scraper/filters/__init__.py rename to crawl4ai/scraper/filters.py diff --git a/crawl4ai/scraper/filters/content_type_filter.py b/crawl4ai/scraper/filters/content_type_filter.py deleted file mode 100644 index 6966afdb..00000000 --- a/crawl4ai/scraper/filters/content_type_filter.py +++ /dev/null @@ -1,43 +0,0 @@ -from .url_filter import URLFilter -from typing import List, Union -from urllib.parse import urlparse -import mimetypes - - -class ContentTypeFilter(URLFilter): - """Filter URLs based on expected content type""" - - def __init__(self, allowed_types: Union[str, List[str]], - check_extension: bool = True): - super().__init__() - self.allowed_types = [allowed_types] if isinstance(allowed_types, str) else allowed_types - self.check_extension = check_extension - self._normalize_types() - - def _normalize_types(self): - """Normalize content type strings""" - self.allowed_types = [t.lower() for t in self.allowed_types] - - def _check_extension(self, url: str) -> bool: - """Check URL's file extension""" - ext = urlparse(url).path.split('.')[-1].lower() if '.' in urlparse(url).path else '' - if not ext: - return True # No extension, might be dynamic content - - guessed_type = mimetypes.guess_type(url)[0] - return any(allowed in (guessed_type or '').lower() for allowed in self.allowed_types) - - def apply(self, url: str) -> bool: - """Check if URL's content type is allowed""" - result = True - if self.check_extension: - result = self._check_extension(url) - self._update_stats(result) - return result - -# class ContentTypeFilter(URLFilter): -# def __init__(self, contentType: str): -# self.contentType = contentType -# def apply(self, url: str) -> bool: -# #TODO: This is a stub. Will implement this later -# return True \ No newline at end of file diff --git a/crawl4ai/scraper/filters/url_filter.py b/crawl4ai/scraper/filters/url_filter.py deleted file mode 100644 index 88a2c60a..00000000 --- a/crawl4ai/scraper/filters/url_filter.py +++ /dev/null @@ -1,72 +0,0 @@ -from abc import ABC, abstractmethod -from dataclasses import dataclass -import logging -from typing import List -@dataclass -class FilterStats: - """Statistics for filter applications""" - total_urls: int = 0 - rejected_urls: int = 0 - passed_urls: int = 0 - -class URLFilter(ABC): - """Base class for URL filters""" - - def __init__(self, name: str = None): - self.name = name or self.__class__.__name__ - self.stats = FilterStats() - self.logger = logging.getLogger(f"urlfilter.{self.name}") - - @abstractmethod - def apply(self, url: str) -> bool: - """Apply the filter to a URL""" - pass - - def _update_stats(self, passed: bool): - """Update filter statistics""" - self.stats.total_urls += 1 - if passed: - self.stats.passed_urls += 1 - else: - self.stats.rejected_urls += 1 - -class FilterChain: - """Chain of URL filters""" - - def __init__(self, filters: List[URLFilter] = None): - self.filters = filters or [] - self.stats = FilterStats() - self.logger = logging.getLogger("urlfilter.chain") - - def add_filter(self, filter_: URLFilter) -> 'FilterChain': - """Add a filter to the chain""" - self.filters.append(filter_) - return self # Enable method chaining - - def apply(self, url: str) -> bool: - """Apply all filters in the chain""" - self.stats.total_urls += 1 - - for filter_ in self.filters: - if not filter_.apply(url): - self.stats.rejected_urls += 1 - self.logger.debug(f"URL {url} rejected by {filter_.name}") - return False - - self.stats.passed_urls += 1 - return True - -# class URLFilter(ABC): -# @abstractmethod -# def apply(self, url: str) -> bool: -# pass - -# class FilterChain: -# def __init__(self): -# self.filters = [] - -# def add_filter(self, filter: URLFilter): -# self.filters.append(filter) - -# def apply(self, url: str) -> bool: -# return all(filter.apply(url) for filter in self.filters) \ No newline at end of file diff --git a/crawl4ai/scraper/filters/url_pattern_filter.py b/crawl4ai/scraper/filters/url_pattern_filter.py deleted file mode 100644 index 1e02b4a6..00000000 --- a/crawl4ai/scraper/filters/url_pattern_filter.py +++ /dev/null @@ -1,39 +0,0 @@ -from .url_filter import URLFilter -from re import Pattern -from typing import List, Union -import re -import fnmatch - - -class URLPatternFilter(URLFilter): - """Filter URLs based on glob patterns or regex""" - - def __init__(self, patterns: Union[str, Pattern, List[Union[str, Pattern]]], - use_glob: bool = True): - super().__init__() - self.patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns - self.use_glob = use_glob - self._compiled_patterns = [] - - for pattern in self.patterns: - if isinstance(pattern, str) and use_glob: - self._compiled_patterns.append(self._glob_to_regex(pattern)) - else: - self._compiled_patterns.append(re.compile(pattern) if isinstance(pattern, str) else pattern) - - def _glob_to_regex(self, pattern: str) -> Pattern: - """Convert glob pattern to regex""" - return re.compile(fnmatch.translate(pattern)) - - def apply(self, url: str) -> bool: - """Check if URL matches any of the patterns""" - matches = any(pattern.search(url) for pattern in self._compiled_patterns) - self._update_stats(matches) - return matches - -# class URLPatternFilter(URLFilter): -# def __init__(self, pattern: Pattern): -# self.pattern = pattern -# def apply(self, url: str) -> bool: -# #TODO: This is a stub. Will implement this later. -# return True \ No newline at end of file diff --git a/crawl4ai/scraper/scorers/__init__.py b/crawl4ai/scraper/scorers.py similarity index 100% rename from crawl4ai/scraper/scorers/__init__.py rename to crawl4ai/scraper/scorers.py diff --git a/crawl4ai/scraper/scorers/keyword_relevance_scorer.py b/crawl4ai/scraper/scorers/keyword_relevance_scorer.py deleted file mode 100644 index a2338aec..00000000 --- a/crawl4ai/scraper/scorers/keyword_relevance_scorer.py +++ /dev/null @@ -1,9 +0,0 @@ -from .url_scorer import URLScorer -from typing import List - -class KeywordRelevanceScorer(URLScorer): - def __init__(self,keywords: List[str]): - self.keyworkds = keywords - def score(self, url: str) -> float: - #TODO: This is a stub. Will implement this later. - return 1 \ No newline at end of file diff --git a/crawl4ai/scraper/scorers/url_scorer.py b/crawl4ai/scraper/scorers/url_scorer.py deleted file mode 100644 index 6ee9ab05..00000000 --- a/crawl4ai/scraper/scorers/url_scorer.py +++ /dev/null @@ -1,6 +0,0 @@ -from abc import ABC, abstractmethod - -class URLScorer(ABC): - @abstractmethod - def score(self, url: str) -> float: - pass \ No newline at end of file diff --git a/docs/scrapper/scraper_quickstart.py b/docs/scrapper/scraper_quickstart.py new file mode 100644 index 00000000..a2c7a239 --- /dev/null +++ b/docs/scrapper/scraper_quickstart.py @@ -0,0 +1,184 @@ +# basic_scraper_example.py +from crawl4ai.scraper import ( + AsyncWebScraper, + BFSScraperStrategy, + FilterChain, + URLPatternFilter, + ContentTypeFilter +) +from crawl4ai.async_webcrawler import AsyncWebCrawler + +async def basic_scraper_example(): + """ + Basic example: Scrape a blog site for articles + - Crawls only HTML pages + - Stays within the blog section + - Collects all results at once + """ + # Create a simple filter chain + filter_chain = FilterChain([ + # Only crawl pages within the blog section + URLPatternFilter("*/blog/*"), + # Only process HTML pages + ContentTypeFilter(["text/html"]) + ]) + + # Initialize the strategy with basic configuration + strategy = BFSScraperStrategy( + max_depth=2, # Only go 2 levels deep + filter_chain=filter_chain, + url_scorer=None, # Use default scoring + max_concurrent=3 # Limit concurrent requests + ) + + # Create the crawler and scraper + crawler = AsyncWebCrawler() + scraper = AsyncWebScraper(crawler, strategy) + + # Start scraping + try: + result = await scraper.ascrape("https://example.com/blog/") + + # Process results + print(f"Crawled {len(result.crawled_urls)} pages:") + for url, data in result.extracted_data.items(): + print(f"- {url}: {len(data.html)} bytes") + + except Exception as e: + print(f"Error during scraping: {e}") + +# advanced_scraper_example.py +import logging +from crawl4ai.scraper import ( + AsyncWebScraper, + BFSScraperStrategy, + FilterChain, + URLPatternFilter, + ContentTypeFilter, + DomainFilter, + KeywordRelevanceScorer, + PathDepthScorer, + FreshnessScorer, + CompositeScorer +) +from crawl4ai.async_webcrawler import AsyncWebCrawler + +async def advanced_scraper_example(): + """ + Advanced example: Intelligent news site scraping + - Uses all filter types + - Implements sophisticated scoring + - Streams results + - Includes monitoring and logging + """ + # Set up logging + logging.basicConfig(level=logging.INFO) + logger = logging.getLogger("advanced_scraper") + + # Create sophisticated filter chain + filter_chain = FilterChain([ + # Domain control + DomainFilter( + allowed_domains=["example.com", "blog.example.com"], + blocked_domains=["ads.example.com", "tracker.example.com"] + ), + # URL patterns + URLPatternFilter([ + "*/article/*", + "*/news/*", + "*/blog/*", + re.compile(r"\d{4}/\d{2}/.*") # Date-based URLs + ]), + # Content types + ContentTypeFilter([ + "text/html", + "application/xhtml+xml" + ]) + ]) + + # Create composite scorer + scorer = CompositeScorer([ + # Prioritize by keywords + KeywordRelevanceScorer( + keywords=["news", "breaking", "update", "latest"], + weight=1.0 + ), + # Prefer optimal URL structure + PathDepthScorer( + optimal_depth=3, + weight=0.7 + ), + # Prioritize fresh content + FreshnessScorer(weight=0.9) + ]) + + # Initialize strategy with advanced configuration + strategy = BFSScraperStrategy( + max_depth=4, + filter_chain=filter_chain, + url_scorer=scorer, + max_concurrent=5, + min_crawl_delay=1 + ) + + # Create crawler and scraper + crawler = AsyncWebCrawler() + scraper = AsyncWebScraper(crawler, strategy) + + # Track statistics + stats = { + 'processed': 0, + 'errors': 0, + 'total_size': 0 + } + + try: + # Use streaming mode + async for result in scraper.ascrape("https://example.com/news/", stream=True): + stats['processed'] += 1 + + if result.success: + stats['total_size'] += len(result.html) + logger.info(f"Processed: {result.url}") + + # Print scoring information + for scorer_name, score in result.scores.items(): + logger.debug(f"{scorer_name}: {score:.2f}") + else: + stats['errors'] += 1 + logger.error(f"Failed to process {result.url}: {result.error_message}") + + # Log progress regularly + if stats['processed'] % 10 == 0: + logger.info(f"Progress: {stats['processed']} URLs processed") + + except Exception as e: + logger.error(f"Scraping error: {e}") + + finally: + # Print final statistics + logger.info("Scraping completed:") + logger.info(f"- URLs processed: {stats['processed']}") + logger.info(f"- Errors: {stats['errors']}") + logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB") + + # Print filter statistics + for filter_ in filter_chain.filters: + logger.info(f"{filter_.name} stats:") + logger.info(f"- Passed: {filter_.stats.passed_urls}") + logger.info(f"- Rejected: {filter_.stats.rejected_urls}") + + # Print scorer statistics + logger.info("Scoring statistics:") + logger.info(f"- Average score: {scorer.stats.average_score:.2f}") + logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}") + +if __name__ == "__main__": + import asyncio + + # Run basic example + print("Running basic scraper example...") + asyncio.run(basic_scraper_example()) + + print("\nRunning advanced scraper example...") + asyncio.run(advanced_scraper_example()) \ No newline at end of file diff --git a/docs/scrapper/web_crawler_quick_start.py b/docs/scrapper/web_crawler_quick_start.py deleted file mode 100644 index 99360f42..00000000 --- a/docs/scrapper/web_crawler_quick_start.py +++ /dev/null @@ -1,111 +0,0 @@ -import unittest, os -from crawl4ai.web_crawler import WebCrawler -from crawl4ai.chunking_strategy import RegexChunking, FixedLengthWordChunking, SlidingWindowChunking -from crawl4ai.extraction_strategy import CosineStrategy, LLMExtractionStrategy, TopicExtractionStrategy, NoExtractionStrategy - -class TestWebCrawler(unittest.TestCase): - - def setUp(self): - self.crawler = WebCrawler() - - def test_warmup(self): - self.crawler.warmup() - self.assertTrue(self.crawler.ready, "WebCrawler failed to warm up") - - def test_run_default_strategies(self): - result = self.crawler.run( - url='https://www.nbcnews.com/business', - word_count_threshold=5, - chunking_strategy=RegexChunking(), - extraction_strategy=CosineStrategy(), bypass_cache=True - ) - self.assertTrue(result.success, "Failed to crawl and extract using default strategies") - - def test_run_different_strategies(self): - url = 'https://www.nbcnews.com/business' - - # Test with FixedLengthWordChunking and LLMExtractionStrategy - result = self.crawler.run( - url=url, - word_count_threshold=5, - chunking_strategy=FixedLengthWordChunking(chunk_size=100), - extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-3.5-turbo", api_token=os.getenv('OPENAI_API_KEY')), bypass_cache=True - ) - self.assertTrue(result.success, "Failed to crawl and extract with FixedLengthWordChunking and LLMExtractionStrategy") - - # Test with SlidingWindowChunking and TopicExtractionStrategy - result = self.crawler.run( - url=url, - word_count_threshold=5, - chunking_strategy=SlidingWindowChunking(window_size=100, step=50), - extraction_strategy=TopicExtractionStrategy(num_keywords=5), bypass_cache=True - ) - self.assertTrue(result.success, "Failed to crawl and extract with SlidingWindowChunking and TopicExtractionStrategy") - - def test_invalid_url(self): - with self.assertRaises(Exception) as context: - self.crawler.run(url='invalid_url', bypass_cache=True) - self.assertIn("Invalid URL", str(context.exception)) - - def test_unsupported_extraction_strategy(self): - with self.assertRaises(Exception) as context: - self.crawler.run(url='https://www.nbcnews.com/business', extraction_strategy="UnsupportedStrategy", bypass_cache=True) - self.assertIn("Unsupported extraction strategy", str(context.exception)) - - def test_invalid_css_selector(self): - with self.assertRaises(ValueError) as context: - self.crawler.run(url='https://www.nbcnews.com/business', css_selector="invalid_selector", bypass_cache=True) - self.assertIn("Invalid CSS selector", str(context.exception)) - - - def test_crawl_with_cache_and_bypass_cache(self): - url = 'https://www.nbcnews.com/business' - - # First crawl with cache enabled - result = self.crawler.run(url=url, bypass_cache=False) - self.assertTrue(result.success, "Failed to crawl and cache the result") - - # Second crawl with bypass_cache=True - result = self.crawler.run(url=url, bypass_cache=True) - self.assertTrue(result.success, "Failed to bypass cache and fetch fresh data") - - def test_fetch_multiple_pages(self): - urls = [ - 'https://www.nbcnews.com/business', - 'https://www.bbc.com/news' - ] - results = [] - for url in urls: - result = self.crawler.run( - url=url, - word_count_threshold=5, - chunking_strategy=RegexChunking(), - extraction_strategy=CosineStrategy(), - bypass_cache=True - ) - results.append(result) - - self.assertEqual(len(results), 2, "Failed to crawl and extract multiple pages") - for result in results: - self.assertTrue(result.success, "Failed to crawl and extract a page in the list") - - def test_run_fixed_length_word_chunking_and_no_extraction(self): - result = self.crawler.run( - url='https://www.nbcnews.com/business', - word_count_threshold=5, - chunking_strategy=FixedLengthWordChunking(chunk_size=100), - extraction_strategy=NoExtractionStrategy(), bypass_cache=True - ) - self.assertTrue(result.success, "Failed to crawl and extract with FixedLengthWordChunking and NoExtractionStrategy") - - def test_run_sliding_window_and_no_extraction(self): - result = self.crawler.run( - url='https://www.nbcnews.com/business', - word_count_threshold=5, - chunking_strategy=SlidingWindowChunking(window_size=100, step=50), - extraction_strategy=NoExtractionStrategy(), bypass_cache=True - ) - self.assertTrue(result.success, "Failed to crawl and extract with SlidingWindowChunking and NoExtractionStrategy") - -if __name__ == '__main__': - unittest.main()