From 82a25c037a710f00d9b05d5c8063d405d0a063ea Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 5 Jun 2025 15:46:24 +0800 Subject: [PATCH] feat(async_url_seeder): add smart URL filtering to exclude nonsense URLs This update introduces a new feature in the URL seeding process that allows for the automatic filtering of utility URLs, such as robots.txt and sitemap.xml, which are not useful for content crawling. The `SeedingConfig` class has been enhanced with a new parameter, `filter_nonsense_urls`, which is enabled by default. This change aims to improve the efficiency of the crawling process by reducing the number of irrelevant URLs processed. Significant modifications include: - Added `filter_nonsense_urls` parameter to `SeedingConfig` in `crawl4ai/async_configs.py`. - Implemented logic in `AsyncUrlSeeder` to check and filter out nonsense URLs during the seeding process in `crawl4ai/async_url_seeder.py`. - Updated documentation to reflect the new filtering feature and provide examples of its usage in `docs/md_v2/core/url-seeding.md`. This change enhances the overall functionality of the URL seeder, making it smarter and more efficient in identifying and excluding non-content URLs. BREAKING CHANGE: The `SeedingConfig` now requires the `filter_nonsense_urls` parameter to be explicitly set if the default behavior is to be altered. Related issues: #123 --- crawl4ai/async_configs.py | 2 + crawl4ai/async_url_seeder.py | 112 ++++++++++++++++++++++++++++++++- docs/md_v2/core/url-seeding.md | 34 +++++++++- 3 files changed, 144 insertions(+), 4 deletions(-) diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 4eb116c6..e63af8cc 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -1429,6 +1429,7 @@ class SeedingConfig: query: Optional[str] = None, # Search query for relevance scoring score_threshold: Optional[float] = None, # Minimum relevance score to include URL (0.0-1.0) scoring_method: str = "bm25", # Scoring method: "bm25" (default), future: "semantic" + filter_nonsense_urls: bool = True, # Filter out utility URLs like robots.txt, sitemap.xml, etc. 
): self.source = source self.pattern = pattern @@ -1444,6 +1445,7 @@ class SeedingConfig: self.query = query self.score_threshold = score_threshold self.scoring_method = scoring_method + self.filter_nonsense_urls = filter_nonsense_urls # Add to_dict, from_kwargs, and clone methods for consistency def to_dict(self) -> Dict[str, Any]: diff --git a/crawl4ai/async_url_seeder.py b/crawl4ai/async_url_seeder.py index c3931b07..2f3dae46 100644 --- a/crawl4ai/async_url_seeder.py +++ b/crawl4ai/async_url_seeder.py @@ -338,6 +338,7 @@ class AsyncUrlSeeder: producer_done = asyncio.Event() stop_event = asyncio.Event() seen: set[str] = set() + filter_nonsense = config.filter_nonsense_urls # Extract this for passing to workers async def producer(): try: @@ -398,10 +399,12 @@ class AsyncUrlSeeder: if self._rate_sem: # global QPS control async with self._rate_sem: await self._validate(url, res_list, live_check, extract_head, - head_timeout, verbose, query, score_threshold, scoring_method) + head_timeout, verbose, query, score_threshold, scoring_method, + filter_nonsense) else: await self._validate(url, res_list, live_check, extract_head, - head_timeout, verbose, query, score_threshold, scoring_method) + head_timeout, verbose, query, score_threshold, scoring_method, + filter_nonsense) queue.task_done() # Mark task as done for queue.join() if ever used # launch @@ -746,9 +749,16 @@ class AsyncUrlSeeder: # ─────────────────────────────── validate helpers async def _validate(self, url: str, res_list: List[Dict[str, Any]], live: bool, extract: bool, timeout: int, verbose: bool, query: Optional[str] = None, - score_threshold: Optional[float] = None, scoring_method: str = "bm25"): + score_threshold: Optional[float] = None, scoring_method: str = "bm25", + filter_nonsense: bool = True): # Local verbose parameter for this function is used to decide if intermediate logs should be printed # The main logger's verbose status should be controlled by the caller. 
+ + # First check if this is a nonsense URL (if filtering is enabled) + if filter_nonsense and self._is_nonsense_url(url): + self._log("debug", "Filtered out nonsense URL: {url}", + params={"url": url}, tag="URL_SEED") + return cache_kind = "head" if extract else "live" @@ -1106,6 +1116,102 @@ class AsyncUrlSeeder: final_score = weighted_score / total_weight if total_weight > 0 else 0 return min(final_score, 1.0) # Cap at 1.0 + def _is_nonsense_url(self, url: str) -> bool: + """ + Check if URL is a utility/nonsense URL that shouldn't be crawled. + Returns True if the URL should be filtered out. + """ + url_lower = url.lower() + + # Extract path and filename + from urllib.parse import urlparse + parsed = urlparse(url) + path = parsed.path.lower() + + # 1. Robot and sitemap files + if path.endswith(('/robots.txt', '/sitemap.xml', '/sitemap_index.xml')): + return True + + # 2. Sitemap variations + if '/sitemap' in path and path.endswith(('.xml', '.xml.gz', '.txt')): + return True + + # 3. Common utility files + utility_files = [ + 'ads.txt', 'humans.txt', 'security.txt', '.well-known/security.txt', + 'crossdomain.xml', 'browserconfig.xml', 'manifest.json', + 'apple-app-site-association', '.well-known/apple-app-site-association', + 'favicon.ico', 'apple-touch-icon.png', 'android-chrome-192x192.png' + ] + if any(path.endswith(f'/{file}') for file in utility_files): + return True + + # # 4. Feed files + # if path.endswith(('.rss', '.atom', '/feed', '/rss', '/atom', '/feed.xml', '/rss.xml')): + # return True + + # # 5. API endpoints and data files + # api_patterns = ['/api/', '/v1/', '/v2/', '/v3/', '/graphql', '/.json', '/.xml'] + # if any(pattern in path for pattern in api_patterns): + # return True + + # # 6. 
Archive and download files + # download_extensions = [ + # '.zip', '.tar', '.gz', '.rar', '.7z', '.bz2', + # '.exe', '.dmg', '.pkg', '.deb', '.rpm', + # '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', + # '.csv', '.tsv', '.sql', '.db', '.sqlite' + # ] + # if any(path.endswith(ext) for ext in download_extensions): + # return True + + # # 7. Media files (often not useful for text content) + # media_extensions = [ + # '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', '.ico', + # '.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm', + # '.mp3', '.wav', '.ogg', '.m4a', '.flac', + # '.woff', '.woff2', '.ttf', '.eot', '.otf' + # ] + # if any(path.endswith(ext) for ext in media_extensions): + # return True + + # # 8. Source code and config files + # code_extensions = [ + # '.js', '.css', '.scss', '.sass', '.less', + # '.map', '.min.js', '.min.css', + # '.py', '.rb', '.php', '.java', '.cpp', '.h', + # '.yaml', '.yml', '.toml', '.ini', '.conf', '.config' + # ] + # if any(path.endswith(ext) for ext in code_extensions): + # return True + + # 9. Hidden files and directories + path_parts = path.split('/') + if any(part.startswith('.') for part in path_parts if part): + return True + + # 10. Common non-content paths + non_content_paths = [ + '/wp-admin', '/wp-includes', '/wp-content/uploads', + '/admin', '/login', '/signin', '/signup', '/register', + '/checkout', '/cart', '/account', '/profile', + '/search', '/404', '/error', + '/.git', '/.svn', '/.hg', + '/cgi-bin', '/scripts', '/includes' + ] + if any(ncp in path for ncp in non_content_paths): + return True + + # 11. URL patterns that indicate non-content + if any(pattern in url_lower for pattern in ['?print=', '&print=', '/print/', '_print.']): + return True + + # 12. 
Very short paths (likely homepage redirects or errors) + if len(path.strip('/')) < 3 and path not in ['/', '/en', '/de', '/fr', '/es', '/it']: + return True + + return False + def _calculate_bm25_score(self, query: str, documents: List[str]) -> List[float]: """Calculate BM25 scores for documents against a query.""" if not HAS_BM25: diff --git a/docs/md_v2/core/url-seeding.md b/docs/md_v2/core/url-seeding.md index 909e463e..24cdfa46 100644 --- a/docs/md_v2/core/url-seeding.md +++ b/docs/md_v2/core/url-seeding.md @@ -253,6 +253,7 @@ The `SeedingConfig` object is your control panel. Here's everything you can conf | `query` | str | None | Search query for BM25 scoring | | `scoring_method` | str | None | Scoring method (currently "bm25") | | `score_threshold` | float | None | Minimum score to include URL | +| `filter_nonsense_urls` | bool | True | Filter out utility URLs (robots.txt, etc.) | #### Pattern Matching Examples @@ -1078,12 +1079,43 @@ URL seeding transforms web crawling from a blind expedition into a surgical stri Whether you're building a research tool, monitoring competitors, or creating a content aggregator, URL seeding gives you the intelligence to crawl smarter, not harder. +### Smart URL Filtering + +The seeder automatically filters out nonsense URLs that aren't useful for content crawling: + +```python +# Enabled by default +config = SeedingConfig( + source="sitemap", + filter_nonsense_urls=True # Default: True +) + +# URLs that get filtered: +# - robots.txt, sitemap.xml, ads.txt +# - Other utility files (favicon.ico, manifest.json, humans.txt) +# - Hidden files and directories (/.git, /.well-known) +# - Admin/login/signup pages +# - Cart, checkout, account, and search pages +# - Print views (?print=, /print/) +# - Very short paths (under 3 characters, except "/" and language roots like /en) +``` + +To disable filtering (not recommended): + +```python +config = SeedingConfig( + source="sitemap", + filter_nonsense_urls=False # Include ALL URLs +) +``` + ### Key Features Summary 1. **Parallel Sitemap Index Processing**: Automatically detects and processes sitemap indexes in parallel 2. 
**Memory Protection**: Bounded queues prevent RAM issues with large domains (1M+ URLs) 3. **Context Manager Support**: Automatic cleanup with `async with` statement 4. **URL-Based Scoring**: Smart filtering even without head extraction -5. **Dual Caching**: Separate caches for URL lists and metadata +5. **Smart URL Filtering**: Automatically excludes utility/nonsense URLs +6. **Dual Caching**: Separate caches for URL lists and metadata Now go forth and seed intelligently! 🌱🚀 \ No newline at end of file