feat(deep-crawling): improve URL normalization and domain filtering
Enhance URL handling in deep crawling with: - New URL normalization functions for consistent URL formats - Improved domain filtering with subdomain support - Added URLPatternFilter to public API - Better URL deduplication in BFS strategy These changes improve crawling accuracy and reduce duplicate visits.
This commit is contained in:
@@ -48,8 +48,9 @@ from .deep_crawling import (
|
||||
DeepCrawlStrategy,
|
||||
BFSDeepCrawlStrategy,
|
||||
FilterChain,
|
||||
ContentTypeFilter,
|
||||
URLPatternFilter,
|
||||
DomainFilter,
|
||||
ContentTypeFilter,
|
||||
URLFilter,
|
||||
FilterStats,
|
||||
SEOFilter,
|
||||
@@ -75,6 +76,7 @@ __all__ = [
|
||||
"BestFirstCrawlingStrategy",
|
||||
"DFSDeepCrawlStrategy",
|
||||
"FilterChain",
|
||||
"URLPatternFilter",
|
||||
"ContentTypeFilter",
|
||||
"DomainFilter",
|
||||
"FilterStats",
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
# crawl4ai/_version.py
|
||||
__version__ = "0.5.0.post3"
|
||||
__version__ = "0.5.0.post4"
|
||||
|
||||
@@ -10,6 +10,7 @@ from .filters import FilterChain
|
||||
from .scorers import URLScorer
|
||||
from . import DeepCrawlStrategy
|
||||
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
|
||||
from ..utils import normalize_url_for_deep_crawl, efficient_normalize_url_for_deep_crawl
|
||||
from math import inf as infinity
|
||||
|
||||
class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
||||
@@ -99,14 +100,17 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
||||
# First collect all valid links
|
||||
for link in links:
|
||||
url = link.get("href")
|
||||
if url in visited:
|
||||
# Strip URL fragments to avoid duplicate crawling
|
||||
# base_url = url.split('#')[0] if url else url
|
||||
base_url = normalize_url_for_deep_crawl(url, source_url)
|
||||
if base_url in visited:
|
||||
continue
|
||||
if not await self.can_process_url(url, next_depth):
|
||||
self.stats.urls_skipped += 1
|
||||
continue
|
||||
|
||||
# Score the URL if a scorer is provided
|
||||
score = self.url_scorer.score(url) if self.url_scorer else 0
|
||||
score = self.url_scorer.score(base_url) if self.url_scorer else 0
|
||||
|
||||
# Skip URLs with scores below the threshold
|
||||
if score < self.score_threshold:
|
||||
@@ -114,7 +118,7 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
||||
self.stats.urls_skipped += 1
|
||||
continue
|
||||
|
||||
valid_links.append((url, score))
|
||||
valid_links.append((base_url, score))
|
||||
|
||||
# If we have more valid links than capacity, sort by score and take the top ones
|
||||
if len(valid_links) > remaining_capacity:
|
||||
|
||||
@@ -427,6 +427,11 @@ class DomainFilter(URLFilter):
|
||||
if isinstance(domains, str):
|
||||
return {domains.lower()}
|
||||
return {d.lower() for d in domains}
|
||||
|
||||
@staticmethod
|
||||
def _is_subdomain(domain: str, parent_domain: str) -> bool:
|
||||
"""Check if domain is a subdomain of parent_domain"""
|
||||
return domain == parent_domain or domain.endswith(f".{parent_domain}")
|
||||
|
||||
@staticmethod
|
||||
@lru_cache(maxsize=10000)
|
||||
@@ -444,20 +449,26 @@ class DomainFilter(URLFilter):
|
||||
|
||||
domain = self._extract_domain(url)
|
||||
|
||||
# Early return for blocked domains
|
||||
if domain in self._blocked_domains:
|
||||
self._update_stats(False)
|
||||
return False
|
||||
# Check for blocked domains, including subdomains
|
||||
for blocked in self._blocked_domains:
|
||||
if self._is_subdomain(domain, blocked):
|
||||
self._update_stats(False)
|
||||
return False
|
||||
|
||||
# If no allowed domains specified, accept all non-blocked
|
||||
if self._allowed_domains is None:
|
||||
self._update_stats(True)
|
||||
return True
|
||||
|
||||
# Final allowed domains check
|
||||
result = domain in self._allowed_domains
|
||||
self._update_stats(result)
|
||||
return result
|
||||
# Check if domain matches any allowed domain (including subdomains)
|
||||
for allowed in self._allowed_domains:
|
||||
if self._is_subdomain(domain, allowed):
|
||||
self._update_stats(True)
|
||||
return True
|
||||
|
||||
# No matches found
|
||||
self._update_stats(False)
|
||||
return False
|
||||
|
||||
|
||||
class ContentRelevanceFilter(URLFilter):
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import time
|
||||
from urllib.parse import urlparse
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString
|
||||
import json
|
||||
@@ -33,6 +32,8 @@ import hashlib
|
||||
|
||||
from urllib.robotparser import RobotFileParser
|
||||
import aiohttp
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
from functools import lru_cache
|
||||
|
||||
from packaging import version
|
||||
from . import __version__
|
||||
@@ -1962,6 +1963,82 @@ def normalize_url(href, base_url):
|
||||
return normalized
|
||||
|
||||
|
||||
def normalize_url_for_deep_crawl(href, base_url):
|
||||
"""Normalize URLs to ensure consistent format"""
|
||||
from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode
|
||||
|
||||
# Handle None or empty values
|
||||
if not href:
|
||||
return None
|
||||
|
||||
# Use urljoin to handle relative URLs
|
||||
full_url = urljoin(base_url, href.strip())
|
||||
|
||||
# Parse the URL for normalization
|
||||
parsed = urlparse(full_url)
|
||||
|
||||
# Convert hostname to lowercase
|
||||
netloc = parsed.netloc.lower()
|
||||
|
||||
# Remove fragment entirely
|
||||
fragment = ''
|
||||
|
||||
# Normalize query parameters if needed
|
||||
query = parsed.query
|
||||
if query:
|
||||
# Parse query parameters
|
||||
params = parse_qs(query)
|
||||
|
||||
# Remove tracking parameters (example - customize as needed)
|
||||
tracking_params = ['utm_source', 'utm_medium', 'utm_campaign', 'ref', 'fbclid']
|
||||
for param in tracking_params:
|
||||
if param in params:
|
||||
del params[param]
|
||||
|
||||
# Rebuild query string, sorted for consistency
|
||||
query = urlencode(params, doseq=True) if params else ''
|
||||
|
||||
# Build normalized URL
|
||||
normalized = urlunparse((
|
||||
parsed.scheme,
|
||||
netloc,
|
||||
parsed.path.rstrip('/') or '/', # Normalize trailing slash
|
||||
parsed.params,
|
||||
query,
|
||||
fragment
|
||||
))
|
||||
|
||||
return normalized
|
||||
|
||||
@lru_cache(maxsize=10000)
|
||||
def efficient_normalize_url_for_deep_crawl(href, base_url):
|
||||
"""Efficient URL normalization with proper parsing"""
|
||||
from urllib.parse import urljoin
|
||||
|
||||
if not href:
|
||||
return None
|
||||
|
||||
# Resolve relative URLs
|
||||
full_url = urljoin(base_url, href.strip())
|
||||
|
||||
# Use proper URL parsing
|
||||
parsed = urlparse(full_url)
|
||||
|
||||
# Only perform the most critical normalizations
|
||||
# 1. Lowercase hostname
|
||||
# 2. Remove fragment
|
||||
normalized = urlunparse((
|
||||
parsed.scheme,
|
||||
parsed.netloc.lower(),
|
||||
parsed.path,
|
||||
parsed.params,
|
||||
parsed.query,
|
||||
'' # Remove fragment
|
||||
))
|
||||
|
||||
return normalized
|
||||
|
||||
|
||||
def normalize_url_tmp(href, base_url):
|
||||
"""Normalize URLs to ensure consistent format"""
|
||||
# Extract protocol and domain from base URL
|
||||
|
||||
78
docs/snippets/deep_crawl/intro.py
Normal file
78
docs/snippets/deep_crawl/intro.py
Normal file
@@ -0,0 +1,78 @@
|
||||
import asyncio
|
||||
from typing import List
|
||||
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
CrawlerRunConfig,
|
||||
BFSDeepCrawlStrategy,
|
||||
CrawlResult,
|
||||
FilterChain,
|
||||
DomainFilter,
|
||||
URLPatternFilter,
|
||||
)
|
||||
|
||||
# Import necessary classes from crawl4ai library:
|
||||
# - AsyncWebCrawler: The main class for web crawling.
|
||||
# - CrawlerRunConfig: Configuration class for crawler behavior.
|
||||
# - BFSDeepCrawlStrategy: Breadth-First Search deep crawling strategy.
|
||||
# - CrawlResult: Data model for individual crawl results.
|
||||
# - FilterChain: Used to chain multiple URL filters.
|
||||
# - URLPatternFilter: Filter URLs based on patterns.
|
||||
# You had from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter, which is also correct,
|
||||
# but for simplicity and consistency, we will use the direct import from crawl4ai in this example, as it is re-exported in __init__.py
|
||||
|
||||
async def basic_deep_crawl():
|
||||
"""
|
||||
Performs a basic deep crawl starting from a seed URL, demonstrating:
|
||||
- Breadth-First Search (BFS) deep crawling strategy.
|
||||
- Filtering URLs based on URL patterns.
|
||||
- Accessing crawl results and metadata.
|
||||
"""
|
||||
|
||||
# 1. Define URL Filters:
|
||||
# Create a URLPatternFilter to include only URLs containing "text".
|
||||
# This filter will be used to restrict crawling to URLs that are likely to contain textual content.
|
||||
url_filter = URLPatternFilter(
|
||||
patterns=[
|
||||
"*text*", # Include URLs that contain "text" in their path or URL
|
||||
]
|
||||
)
|
||||
|
||||
# Create a DomainFilter to allow only URLs from the "groq.com" domain and block URLs from the "example.com" domain.
|
||||
# This filter will be used to restrict crawling to URLs within the "groq.com" domain.
|
||||
domain_filter = DomainFilter(
|
||||
allowed_domains=["groq.com"],
|
||||
blocked_domains=["example.com"],
|
||||
)
|
||||
|
||||
# 2. Configure CrawlerRunConfig for Deep Crawling:
|
||||
# Configure CrawlerRunConfig to use BFSDeepCrawlStrategy for deep crawling.
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||
max_depth=2, # Set the maximum depth of crawling to 2 levels from the start URL
|
||||
max_pages=10, # Limit the total number of pages to crawl to 10, to prevent excessive crawling
|
||||
include_external=False, # Set to False to only crawl URLs within the same domain as the start URL
|
||||
filter_chain=FilterChain(filters=[url_filter, domain_filter]), # Apply the URLPatternFilter and DomainFilter to filter URLs during deep crawl
|
||||
),
|
||||
verbose=True, # Enable verbose logging to see detailed output during crawling
|
||||
)
|
||||
|
||||
# 3. Initialize and Run AsyncWebCrawler:
|
||||
# Use AsyncWebCrawler as a context manager for automatic start and close.
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results: List[CrawlResult] = await crawler.arun(
|
||||
# url="https://docs.crawl4ai.com", # Uncomment to use crawl4ai documentation as start URL
|
||||
url="https://console.groq.com/docs", # Set the start URL for deep crawling to Groq documentation
|
||||
config=config, # Pass the configured CrawlerRunConfig to arun method
|
||||
)
|
||||
|
||||
# 4. Process and Print Crawl Results:
|
||||
# Iterate through the list of CrawlResult objects returned by the deep crawl.
|
||||
for result in results:
|
||||
# Print the URL and its crawl depth from the metadata for each crawled URL.
|
||||
print(f"URL: {result.url}, Depth: {result.metadata.get('depth', 0)}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import asyncio
|
||||
asyncio.run(basic_deep_crawl())
|
||||
Reference in New Issue
Block a user