feat(async_url_seeder): add smart URL filtering to exclude nonsense URLs
This update introduces a new feature in the URL seeding process that allows for the automatic filtering of utility URLs, such as robots.txt and sitemap.xml, which are not useful for content crawling. The `SeedingConfig` class has been enhanced with a new parameter, `filter_nonsense_urls`, which is enabled by default. This change aims to improve the efficiency of the crawling process by reducing the number of irrelevant URLs processed. Significant modifications include: - Added the `filter_nonsense_urls` parameter to `SeedingConfig.__init__` in `async_url_seeder.py`. - Implemented a `_is_nonsense_url` helper in `AsyncUrlSeeder` and wired it into `_validate` to check and filter out nonsense URLs during the seeding process. - Updated documentation to reflect the new filtering feature and provide examples of its usage. This change enhances the overall functionality of the URL seeder, making it smarter and more efficient in identifying and excluding non-content URLs. BREAKING CHANGE: `SeedingConfig` now requires the `filter_nonsense_urls` parameter to be explicitly set to `False` if the default behavior is to be altered. Related issues: #123
This commit is contained in:
@@ -1429,6 +1429,7 @@ class SeedingConfig:
|
|||||||
query: Optional[str] = None, # Search query for relevance scoring
|
query: Optional[str] = None, # Search query for relevance scoring
|
||||||
score_threshold: Optional[float] = None, # Minimum relevance score to include URL (0.0-1.0)
|
score_threshold: Optional[float] = None, # Minimum relevance score to include URL (0.0-1.0)
|
||||||
scoring_method: str = "bm25", # Scoring method: "bm25" (default), future: "semantic"
|
scoring_method: str = "bm25", # Scoring method: "bm25" (default), future: "semantic"
|
||||||
|
filter_nonsense_urls: bool = True, # Filter out utility URLs like robots.txt, sitemap.xml, etc.
|
||||||
):
|
):
|
||||||
self.source = source
|
self.source = source
|
||||||
self.pattern = pattern
|
self.pattern = pattern
|
||||||
@@ -1444,6 +1445,7 @@ class SeedingConfig:
|
|||||||
self.query = query
|
self.query = query
|
||||||
self.score_threshold = score_threshold
|
self.score_threshold = score_threshold
|
||||||
self.scoring_method = scoring_method
|
self.scoring_method = scoring_method
|
||||||
|
self.filter_nonsense_urls = filter_nonsense_urls
|
||||||
|
|
||||||
# Add to_dict, from_kwargs, and clone methods for consistency
|
# Add to_dict, from_kwargs, and clone methods for consistency
|
||||||
def to_dict(self) -> Dict[str, Any]:
|
def to_dict(self) -> Dict[str, Any]:
|
||||||
|
|||||||
@@ -338,6 +338,7 @@ class AsyncUrlSeeder:
|
|||||||
producer_done = asyncio.Event()
|
producer_done = asyncio.Event()
|
||||||
stop_event = asyncio.Event()
|
stop_event = asyncio.Event()
|
||||||
seen: set[str] = set()
|
seen: set[str] = set()
|
||||||
|
filter_nonsense = config.filter_nonsense_urls # Extract this for passing to workers
|
||||||
|
|
||||||
async def producer():
|
async def producer():
|
||||||
try:
|
try:
|
||||||
@@ -398,10 +399,12 @@ class AsyncUrlSeeder:
|
|||||||
if self._rate_sem: # global QPS control
|
if self._rate_sem: # global QPS control
|
||||||
async with self._rate_sem:
|
async with self._rate_sem:
|
||||||
await self._validate(url, res_list, live_check, extract_head,
|
await self._validate(url, res_list, live_check, extract_head,
|
||||||
head_timeout, verbose, query, score_threshold, scoring_method)
|
head_timeout, verbose, query, score_threshold, scoring_method,
|
||||||
|
filter_nonsense)
|
||||||
else:
|
else:
|
||||||
await self._validate(url, res_list, live_check, extract_head,
|
await self._validate(url, res_list, live_check, extract_head,
|
||||||
head_timeout, verbose, query, score_threshold, scoring_method)
|
head_timeout, verbose, query, score_threshold, scoring_method,
|
||||||
|
filter_nonsense)
|
||||||
queue.task_done() # Mark task as done for queue.join() if ever used
|
queue.task_done() # Mark task as done for queue.join() if ever used
|
||||||
|
|
||||||
# launch
|
# launch
|
||||||
@@ -746,9 +749,16 @@ class AsyncUrlSeeder:
|
|||||||
# ─────────────────────────────── validate helpers
|
# ─────────────────────────────── validate helpers
|
||||||
async def _validate(self, url: str, res_list: List[Dict[str, Any]], live: bool,
|
async def _validate(self, url: str, res_list: List[Dict[str, Any]], live: bool,
|
||||||
extract: bool, timeout: int, verbose: bool, query: Optional[str] = None,
|
extract: bool, timeout: int, verbose: bool, query: Optional[str] = None,
|
||||||
score_threshold: Optional[float] = None, scoring_method: str = "bm25"):
|
score_threshold: Optional[float] = None, scoring_method: str = "bm25",
|
||||||
|
filter_nonsense: bool = True):
|
||||||
# Local verbose parameter for this function is used to decide if intermediate logs should be printed
|
# Local verbose parameter for this function is used to decide if intermediate logs should be printed
|
||||||
# The main logger's verbose status should be controlled by the caller.
|
# The main logger's verbose status should be controlled by the caller.
|
||||||
|
|
||||||
|
# First check if this is a nonsense URL (if filtering is enabled)
|
||||||
|
if filter_nonsense and self._is_nonsense_url(url):
|
||||||
|
self._log("debug", "Filtered out nonsense URL: {url}",
|
||||||
|
params={"url": url}, tag="URL_SEED")
|
||||||
|
return
|
||||||
|
|
||||||
cache_kind = "head" if extract else "live"
|
cache_kind = "head" if extract else "live"
|
||||||
|
|
||||||
@@ -1106,6 +1116,102 @@ class AsyncUrlSeeder:
|
|||||||
final_score = weighted_score / total_weight if total_weight > 0 else 0
|
final_score = weighted_score / total_weight if total_weight > 0 else 0
|
||||||
return min(final_score, 1.0) # Cap at 1.0
|
return min(final_score, 1.0) # Cap at 1.0
|
||||||
|
|
||||||
|
def _is_nonsense_url(self, url: str) -> bool:
|
||||||
|
"""
|
||||||
|
Check if URL is a utility/nonsense URL that shouldn't be crawled.
|
||||||
|
Returns True if the URL should be filtered out.
|
||||||
|
"""
|
||||||
|
url_lower = url.lower()
|
||||||
|
|
||||||
|
# Extract path and filename
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
parsed = urlparse(url)
|
||||||
|
path = parsed.path.lower()
|
||||||
|
|
||||||
|
# 1. Robot and sitemap files
|
||||||
|
if path.endswith(('/robots.txt', '/sitemap.xml', '/sitemap_index.xml')):
|
||||||
|
return True
|
||||||
|
|
||||||
|
# 2. Sitemap variations
|
||||||
|
if '/sitemap' in path and path.endswith(('.xml', '.xml.gz', '.txt')):
|
||||||
|
return True
|
||||||
|
|
||||||
|
# 3. Common utility files
|
||||||
|
utility_files = [
|
||||||
|
'ads.txt', 'humans.txt', 'security.txt', '.well-known/security.txt',
|
||||||
|
'crossdomain.xml', 'browserconfig.xml', 'manifest.json',
|
||||||
|
'apple-app-site-association', '.well-known/apple-app-site-association',
|
||||||
|
'favicon.ico', 'apple-touch-icon.png', 'android-chrome-192x192.png'
|
||||||
|
]
|
||||||
|
if any(path.endswith(f'/{file}') for file in utility_files):
|
||||||
|
return True
|
||||||
|
|
||||||
|
# # 4. Feed files
|
||||||
|
# if path.endswith(('.rss', '.atom', '/feed', '/rss', '/atom', '/feed.xml', '/rss.xml')):
|
||||||
|
# return True
|
||||||
|
|
||||||
|
# # 5. API endpoints and data files
|
||||||
|
# api_patterns = ['/api/', '/v1/', '/v2/', '/v3/', '/graphql', '/.json', '/.xml']
|
||||||
|
# if any(pattern in path for pattern in api_patterns):
|
||||||
|
# return True
|
||||||
|
|
||||||
|
# # 6. Archive and download files
|
||||||
|
# download_extensions = [
|
||||||
|
# '.zip', '.tar', '.gz', '.rar', '.7z', '.bz2',
|
||||||
|
# '.exe', '.dmg', '.pkg', '.deb', '.rpm',
|
||||||
|
# '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
|
||||||
|
# '.csv', '.tsv', '.sql', '.db', '.sqlite'
|
||||||
|
# ]
|
||||||
|
# if any(path.endswith(ext) for ext in download_extensions):
|
||||||
|
# return True
|
||||||
|
|
||||||
|
# # 7. Media files (often not useful for text content)
|
||||||
|
# media_extensions = [
|
||||||
|
# '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', '.ico',
|
||||||
|
# '.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm',
|
||||||
|
# '.mp3', '.wav', '.ogg', '.m4a', '.flac',
|
||||||
|
# '.woff', '.woff2', '.ttf', '.eot', '.otf'
|
||||||
|
# ]
|
||||||
|
# if any(path.endswith(ext) for ext in media_extensions):
|
||||||
|
# return True
|
||||||
|
|
||||||
|
# # 8. Source code and config files
|
||||||
|
# code_extensions = [
|
||||||
|
# '.js', '.css', '.scss', '.sass', '.less',
|
||||||
|
# '.map', '.min.js', '.min.css',
|
||||||
|
# '.py', '.rb', '.php', '.java', '.cpp', '.h',
|
||||||
|
# '.yaml', '.yml', '.toml', '.ini', '.conf', '.config'
|
||||||
|
# ]
|
||||||
|
# if any(path.endswith(ext) for ext in code_extensions):
|
||||||
|
# return True
|
||||||
|
|
||||||
|
# 9. Hidden files and directories
|
||||||
|
path_parts = path.split('/')
|
||||||
|
if any(part.startswith('.') for part in path_parts if part):
|
||||||
|
return True
|
||||||
|
|
||||||
|
# 10. Common non-content paths
|
||||||
|
non_content_paths = [
|
||||||
|
'/wp-admin', '/wp-includes', '/wp-content/uploads',
|
||||||
|
'/admin', '/login', '/signin', '/signup', '/register',
|
||||||
|
'/checkout', '/cart', '/account', '/profile',
|
||||||
|
'/search', '/404', '/error',
|
||||||
|
'/.git', '/.svn', '/.hg',
|
||||||
|
'/cgi-bin', '/scripts', '/includes'
|
||||||
|
]
|
||||||
|
if any(ncp in path for ncp in non_content_paths):
|
||||||
|
return True
|
||||||
|
|
||||||
|
# 11. URL patterns that indicate non-content
|
||||||
|
if any(pattern in url_lower for pattern in ['?print=', '&print=', '/print/', '_print.']):
|
||||||
|
return True
|
||||||
|
|
||||||
|
# 12. Very short paths (likely homepage redirects or errors)
|
||||||
|
if len(path.strip('/')) < 3 and path not in ['/', '/en', '/de', '/fr', '/es', '/it']:
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
def _calculate_bm25_score(self, query: str, documents: List[str]) -> List[float]:
|
def _calculate_bm25_score(self, query: str, documents: List[str]) -> List[float]:
|
||||||
"""Calculate BM25 scores for documents against a query."""
|
"""Calculate BM25 scores for documents against a query."""
|
||||||
if not HAS_BM25:
|
if not HAS_BM25:
|
||||||
|
|||||||
@@ -253,6 +253,7 @@ The `SeedingConfig` object is your control panel. Here's everything you can conf
|
|||||||
| `query` | str | None | Search query for BM25 scoring |
|
| `query` | str | None | Search query for BM25 scoring |
|
||||||
| `scoring_method` | str | None | Scoring method (currently "bm25") |
|
| `scoring_method` | str | None | Scoring method (currently "bm25") |
|
||||||
| `score_threshold` | float | None | Minimum score to include URL |
|
| `score_threshold` | float | None | Minimum score to include URL |
|
||||||
|
| `filter_nonsense_urls` | bool | True | Filter out utility URLs (robots.txt, etc.) |
|
||||||
|
|
||||||
#### Pattern Matching Examples
|
#### Pattern Matching Examples
|
||||||
|
|
||||||
@@ -1078,12 +1079,43 @@ URL seeding transforms web crawling from a blind expedition into a surgical stri
|
|||||||
|
|
||||||
Whether you're building a research tool, monitoring competitors, or creating a content aggregator, URL seeding gives you the intelligence to crawl smarter, not harder.
|
Whether you're building a research tool, monitoring competitors, or creating a content aggregator, URL seeding gives you the intelligence to crawl smarter, not harder.
|
||||||
|
|
||||||
|
### Smart URL Filtering
|
||||||
|
|
||||||
|
The seeder automatically filters out nonsense URLs that aren't useful for content crawling:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Enabled by default
|
||||||
|
config = SeedingConfig(
|
||||||
|
source="sitemap",
|
||||||
|
filter_nonsense_urls=True # Default: True
|
||||||
|
)
|
||||||
|
|
||||||
|
# URLs that get filtered:
# - robots.txt, sitemap.xml (and sitemap variants), ads.txt
# - favicon.ico, manifest.json and other well-known utility files
# - hidden files and dot-directories (.git, .well-known, ...)
# - admin/login/signup, cart/checkout and other non-content paths
# - print-view URL variants (?print=, /print/, ...)
|
||||||
|
```
|
||||||
|
|
||||||
|
To disable filtering (not recommended):
|
||||||
|
|
||||||
|
```python
|
||||||
|
config = SeedingConfig(
|
||||||
|
source="sitemap",
|
||||||
|
filter_nonsense_urls=False # Include ALL URLs
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
### Key Features Summary
|
### Key Features Summary
|
||||||
|
|
||||||
1. **Parallel Sitemap Index Processing**: Automatically detects and processes sitemap indexes in parallel
|
1. **Parallel Sitemap Index Processing**: Automatically detects and processes sitemap indexes in parallel
|
||||||
2. **Memory Protection**: Bounded queues prevent RAM issues with large domains (1M+ URLs)
|
2. **Memory Protection**: Bounded queues prevent RAM issues with large domains (1M+ URLs)
|
||||||
3. **Context Manager Support**: Automatic cleanup with `async with` statement
|
3. **Context Manager Support**: Automatic cleanup with `async with` statement
|
||||||
4. **URL-Based Scoring**: Smart filtering even without head extraction
|
4. **URL-Based Scoring**: Smart filtering even without head extraction
|
||||||
5. **Dual Caching**: Separate caches for URL lists and metadata
|
5. **Smart URL Filtering**: Automatically excludes utility/nonsense URLs
|
||||||
|
6. **Dual Caching**: Separate caches for URL lists and metadata
|
||||||
|
|
||||||
Now go forth and seed intelligently! 🌱🚀
|
Now go forth and seed intelligently! 🌱🚀
|
||||||
Reference in New Issue
Block a user