feat(async_url_seeder): add smart URL filtering to exclude nonsense URLs

This update introduces a new feature in the URL seeding process that allows for the automatic filtering of utility URLs, such as robots.txt and sitemap.xml, which are not useful for content crawling. The `SeedingConfig` class has been enhanced with a new parameter, `filter_nonsense_urls`, which is enabled by default. This change aims to improve the efficiency of the crawling process by reducing the number of irrelevant URLs processed.

Significant modifications include:
- Added `filter_nonsense_urls` parameter to `SeedingConfig` in the async configuration module.
- Implemented logic in `AsyncUrlSeeder._is_nonsense_url` to check and filter out nonsense URLs during the seeding process in the URL seeder module.
- Updated documentation to reflect the new filtering feature and provide examples of its usage in the URL seeding guide.

This change enhances the overall functionality of the URL seeder, making it smarter and more efficient in identifying and excluding non-content URLs.

BREAKING CHANGE: The `SeedingConfig` now requires the `filter_nonsense_urls` parameter to be explicitly set to `False` if the default filtering behavior is to be altered.

Related issues: #123
This commit is contained in:
UncleCode
2025-06-05 15:46:24 +08:00
parent c6fc5c0518
commit 82a25c037a
3 changed files with 144 additions and 4 deletions

View File

@@ -1429,6 +1429,7 @@ class SeedingConfig:
query: Optional[str] = None, # Search query for relevance scoring query: Optional[str] = None, # Search query for relevance scoring
score_threshold: Optional[float] = None, # Minimum relevance score to include URL (0.0-1.0) score_threshold: Optional[float] = None, # Minimum relevance score to include URL (0.0-1.0)
scoring_method: str = "bm25", # Scoring method: "bm25" (default), future: "semantic" scoring_method: str = "bm25", # Scoring method: "bm25" (default), future: "semantic"
filter_nonsense_urls: bool = True, # Filter out utility URLs like robots.txt, sitemap.xml, etc.
): ):
self.source = source self.source = source
self.pattern = pattern self.pattern = pattern
@@ -1444,6 +1445,7 @@ class SeedingConfig:
self.query = query self.query = query
self.score_threshold = score_threshold self.score_threshold = score_threshold
self.scoring_method = scoring_method self.scoring_method = scoring_method
self.filter_nonsense_urls = filter_nonsense_urls
# Add to_dict, from_kwargs, and clone methods for consistency # Add to_dict, from_kwargs, and clone methods for consistency
def to_dict(self) -> Dict[str, Any]: def to_dict(self) -> Dict[str, Any]:

View File

@@ -338,6 +338,7 @@ class AsyncUrlSeeder:
producer_done = asyncio.Event() producer_done = asyncio.Event()
stop_event = asyncio.Event() stop_event = asyncio.Event()
seen: set[str] = set() seen: set[str] = set()
filter_nonsense = config.filter_nonsense_urls # Extract this for passing to workers
async def producer(): async def producer():
try: try:
@@ -398,10 +399,12 @@ class AsyncUrlSeeder:
if self._rate_sem: # global QPS control if self._rate_sem: # global QPS control
async with self._rate_sem: async with self._rate_sem:
await self._validate(url, res_list, live_check, extract_head, await self._validate(url, res_list, live_check, extract_head,
head_timeout, verbose, query, score_threshold, scoring_method) head_timeout, verbose, query, score_threshold, scoring_method,
filter_nonsense)
else: else:
await self._validate(url, res_list, live_check, extract_head, await self._validate(url, res_list, live_check, extract_head,
head_timeout, verbose, query, score_threshold, scoring_method) head_timeout, verbose, query, score_threshold, scoring_method,
filter_nonsense)
queue.task_done() # Mark task as done for queue.join() if ever used queue.task_done() # Mark task as done for queue.join() if ever used
# launch # launch
@@ -746,9 +749,16 @@ class AsyncUrlSeeder:
# ─────────────────────────────── validate helpers # ─────────────────────────────── validate helpers
async def _validate(self, url: str, res_list: List[Dict[str, Any]], live: bool, async def _validate(self, url: str, res_list: List[Dict[str, Any]], live: bool,
extract: bool, timeout: int, verbose: bool, query: Optional[str] = None, extract: bool, timeout: int, verbose: bool, query: Optional[str] = None,
score_threshold: Optional[float] = None, scoring_method: str = "bm25"): score_threshold: Optional[float] = None, scoring_method: str = "bm25",
filter_nonsense: bool = True):
# Local verbose parameter for this function is used to decide if intermediate logs should be printed # Local verbose parameter for this function is used to decide if intermediate logs should be printed
# The main logger's verbose status should be controlled by the caller. # The main logger's verbose status should be controlled by the caller.
# First check if this is a nonsense URL (if filtering is enabled)
if filter_nonsense and self._is_nonsense_url(url):
self._log("debug", "Filtered out nonsense URL: {url}",
params={"url": url}, tag="URL_SEED")
return
cache_kind = "head" if extract else "live" cache_kind = "head" if extract else "live"
@@ -1106,6 +1116,102 @@ class AsyncUrlSeeder:
final_score = weighted_score / total_weight if total_weight > 0 else 0 final_score = weighted_score / total_weight if total_weight > 0 else 0
return min(final_score, 1.0) # Cap at 1.0 return min(final_score, 1.0) # Cap at 1.0
def _is_nonsense_url(self, url: str) -> bool:
"""
Check if URL is a utility/nonsense URL that shouldn't be crawled.
Returns True if the URL should be filtered out.
"""
url_lower = url.lower()
# Extract path and filename
from urllib.parse import urlparse
parsed = urlparse(url)
path = parsed.path.lower()
# 1. Robot and sitemap files
if path.endswith(('/robots.txt', '/sitemap.xml', '/sitemap_index.xml')):
return True
# 2. Sitemap variations
if '/sitemap' in path and path.endswith(('.xml', '.xml.gz', '.txt')):
return True
# 3. Common utility files
utility_files = [
'ads.txt', 'humans.txt', 'security.txt', '.well-known/security.txt',
'crossdomain.xml', 'browserconfig.xml', 'manifest.json',
'apple-app-site-association', '.well-known/apple-app-site-association',
'favicon.ico', 'apple-touch-icon.png', 'android-chrome-192x192.png'
]
if any(path.endswith(f'/{file}') for file in utility_files):
return True
# # 4. Feed files
# if path.endswith(('.rss', '.atom', '/feed', '/rss', '/atom', '/feed.xml', '/rss.xml')):
# return True
# # 5. API endpoints and data files
# api_patterns = ['/api/', '/v1/', '/v2/', '/v3/', '/graphql', '/.json', '/.xml']
# if any(pattern in path for pattern in api_patterns):
# return True
# # 6. Archive and download files
# download_extensions = [
# '.zip', '.tar', '.gz', '.rar', '.7z', '.bz2',
# '.exe', '.dmg', '.pkg', '.deb', '.rpm',
# '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
# '.csv', '.tsv', '.sql', '.db', '.sqlite'
# ]
# if any(path.endswith(ext) for ext in download_extensions):
# return True
# # 7. Media files (often not useful for text content)
# media_extensions = [
# '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', '.ico',
# '.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm',
# '.mp3', '.wav', '.ogg', '.m4a', '.flac',
# '.woff', '.woff2', '.ttf', '.eot', '.otf'
# ]
# if any(path.endswith(ext) for ext in media_extensions):
# return True
# # 8. Source code and config files
# code_extensions = [
# '.js', '.css', '.scss', '.sass', '.less',
# '.map', '.min.js', '.min.css',
# '.py', '.rb', '.php', '.java', '.cpp', '.h',
# '.yaml', '.yml', '.toml', '.ini', '.conf', '.config'
# ]
# if any(path.endswith(ext) for ext in code_extensions):
# return True
# 9. Hidden files and directories
path_parts = path.split('/')
if any(part.startswith('.') for part in path_parts if part):
return True
# 10. Common non-content paths
non_content_paths = [
'/wp-admin', '/wp-includes', '/wp-content/uploads',
'/admin', '/login', '/signin', '/signup', '/register',
'/checkout', '/cart', '/account', '/profile',
'/search', '/404', '/error',
'/.git', '/.svn', '/.hg',
'/cgi-bin', '/scripts', '/includes'
]
if any(ncp in path for ncp in non_content_paths):
return True
# 11. URL patterns that indicate non-content
if any(pattern in url_lower for pattern in ['?print=', '&print=', '/print/', '_print.']):
return True
# 12. Very short paths (likely homepage redirects or errors)
if len(path.strip('/')) < 3 and path not in ['/', '/en', '/de', '/fr', '/es', '/it']:
return True
return False
def _calculate_bm25_score(self, query: str, documents: List[str]) -> List[float]: def _calculate_bm25_score(self, query: str, documents: List[str]) -> List[float]:
"""Calculate BM25 scores for documents against a query.""" """Calculate BM25 scores for documents against a query."""
if not HAS_BM25: if not HAS_BM25:

View File

@@ -253,6 +253,7 @@ The `SeedingConfig` object is your control panel. Here's everything you can conf
| `query` | str | None | Search query for BM25 scoring | | `query` | str | None | Search query for BM25 scoring |
| `scoring_method` | str | None | Scoring method (currently "bm25") | | `scoring_method` | str | None | Scoring method (currently "bm25") |
| `score_threshold` | float | None | Minimum score to include URL | | `score_threshold` | float | None | Minimum score to include URL |
| `filter_nonsense_urls` | bool | True | Filter out utility URLs (robots.txt, etc.) |
#### Pattern Matching Examples #### Pattern Matching Examples
@@ -1078,12 +1079,43 @@ URL seeding transforms web crawling from a blind expedition into a surgical stri
Whether you're building a research tool, monitoring competitors, or creating a content aggregator, URL seeding gives you the intelligence to crawl smarter, not harder. Whether you're building a research tool, monitoring competitors, or creating a content aggregator, URL seeding gives you the intelligence to crawl smarter, not harder.
### Smart URL Filtering
The seeder automatically filters out nonsense URLs that aren't useful for content crawling:
```python
# Enabled by default
config = SeedingConfig(
source="sitemap",
filter_nonsense_urls=True # Default: True
)
# URLs that get filtered:
# - robots.txt, sitemap.xml (and sitemap variations)
# - Utility files (ads.txt, humans.txt, favicon.ico, manifest.json, ...)
# - Hidden files and directories (e.g. /.git, /.well-known)
# - Admin, login and other non-content pages
# - Print-view URLs (?print=, /print/)
# - Very short, likely non-content paths
# (Filtering of API endpoints, media files, archives and source files
#  is present in the code but currently disabled.)
```
To disable filtering (not recommended):
```python
config = SeedingConfig(
source="sitemap",
filter_nonsense_urls=False # Include ALL URLs
)
```
### Key Features Summary ### Key Features Summary
1. **Parallel Sitemap Index Processing**: Automatically detects and processes sitemap indexes in parallel 1. **Parallel Sitemap Index Processing**: Automatically detects and processes sitemap indexes in parallel
2. **Memory Protection**: Bounded queues prevent RAM issues with large domains (1M+ URLs) 2. **Memory Protection**: Bounded queues prevent RAM issues with large domains (1M+ URLs)
3. **Context Manager Support**: Automatic cleanup with `async with` statement 3. **Context Manager Support**: Automatic cleanup with `async with` statement
4. **URL-Based Scoring**: Smart filtering even without head extraction 4. **URL-Based Scoring**: Smart filtering even without head extraction
5. **Dual Caching**: Separate caches for URL lists and metadata 5. **Smart URL Filtering**: Automatically excludes utility/nonsense URLs
6. **Dual Caching**: Separate caches for URL lists and metadata
Now go forth and seed intelligently! 🌱🚀 Now go forth and seed intelligently! 🌱🚀