feat(deep-crawling): improve URL normalization and domain filtering
Enhance URL handling in deep crawling with:

- New URL normalization functions for consistent URL formats
- Improved domain filtering with subdomain support
- Added URLPatternFilter to the public API
- Better URL deduplication in the BFS strategy

These changes improve crawling accuracy and reduce duplicate visits.
This commit is contained in:
@@ -48,8 +48,9 @@ from .deep_crawling import (
|
|||||||
DeepCrawlStrategy,
|
DeepCrawlStrategy,
|
||||||
BFSDeepCrawlStrategy,
|
BFSDeepCrawlStrategy,
|
||||||
FilterChain,
|
FilterChain,
|
||||||
ContentTypeFilter,
|
URLPatternFilter,
|
||||||
DomainFilter,
|
DomainFilter,
|
||||||
|
ContentTypeFilter,
|
||||||
URLFilter,
|
URLFilter,
|
||||||
FilterStats,
|
FilterStats,
|
||||||
SEOFilter,
|
SEOFilter,
|
||||||
@@ -75,6 +76,7 @@ __all__ = [
|
|||||||
"BestFirstCrawlingStrategy",
|
"BestFirstCrawlingStrategy",
|
||||||
"DFSDeepCrawlStrategy",
|
"DFSDeepCrawlStrategy",
|
||||||
"FilterChain",
|
"FilterChain",
|
||||||
|
"URLPatternFilter",
|
||||||
"ContentTypeFilter",
|
"ContentTypeFilter",
|
||||||
"DomainFilter",
|
"DomainFilter",
|
||||||
"FilterStats",
|
"FilterStats",
|
||||||
|
|||||||
@@ -1,2 +1,2 @@
|
|||||||
# crawl4ai/_version.py
|
# crawl4ai/_version.py
|
||||||
__version__ = "0.5.0.post3"
|
__version__ = "0.5.0.post4"
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ from .filters import FilterChain
|
|||||||
from .scorers import URLScorer
|
from .scorers import URLScorer
|
||||||
from . import DeepCrawlStrategy
|
from . import DeepCrawlStrategy
|
||||||
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
|
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
|
||||||
|
from ..utils import normalize_url_for_deep_crawl, efficient_normalize_url_for_deep_crawl
|
||||||
from math import inf as infinity
|
from math import inf as infinity
|
||||||
|
|
||||||
class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
||||||
@@ -99,14 +100,17 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
|||||||
# First collect all valid links
|
# First collect all valid links
|
||||||
for link in links:
|
for link in links:
|
||||||
url = link.get("href")
|
url = link.get("href")
|
||||||
if url in visited:
|
# Strip URL fragments to avoid duplicate crawling
|
||||||
|
# base_url = url.split('#')[0] if url else url
|
||||||
|
base_url = normalize_url_for_deep_crawl(url, source_url)
|
||||||
|
if base_url in visited:
|
||||||
continue
|
continue
|
||||||
if not await self.can_process_url(url, next_depth):
|
if not await self.can_process_url(url, next_depth):
|
||||||
self.stats.urls_skipped += 1
|
self.stats.urls_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Score the URL if a scorer is provided
|
# Score the URL if a scorer is provided
|
||||||
score = self.url_scorer.score(url) if self.url_scorer else 0
|
score = self.url_scorer.score(base_url) if self.url_scorer else 0
|
||||||
|
|
||||||
# Skip URLs with scores below the threshold
|
# Skip URLs with scores below the threshold
|
||||||
if score < self.score_threshold:
|
if score < self.score_threshold:
|
||||||
@@ -114,7 +118,7 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
|||||||
self.stats.urls_skipped += 1
|
self.stats.urls_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
valid_links.append((url, score))
|
valid_links.append((base_url, score))
|
||||||
|
|
||||||
# If we have more valid links than capacity, sort by score and take the top ones
|
# If we have more valid links than capacity, sort by score and take the top ones
|
||||||
if len(valid_links) > remaining_capacity:
|
if len(valid_links) > remaining_capacity:
|
||||||
|
|||||||
@@ -428,6 +428,11 @@ class DomainFilter(URLFilter):
|
|||||||
return {domains.lower()}
|
return {domains.lower()}
|
||||||
return {d.lower() for d in domains}
|
return {d.lower() for d in domains}
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _is_subdomain(domain: str, parent_domain: str) -> bool:
|
||||||
|
"""Check if domain is a subdomain of parent_domain"""
|
||||||
|
return domain == parent_domain or domain.endswith(f".{parent_domain}")
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@lru_cache(maxsize=10000)
|
@lru_cache(maxsize=10000)
|
||||||
def _extract_domain(url: str) -> str:
|
def _extract_domain(url: str) -> str:
|
||||||
@@ -444,20 +449,26 @@ class DomainFilter(URLFilter):
|
|||||||
|
|
||||||
domain = self._extract_domain(url)
|
domain = self._extract_domain(url)
|
||||||
|
|
||||||
# Early return for blocked domains
|
# Check for blocked domains, including subdomains
|
||||||
if domain in self._blocked_domains:
|
for blocked in self._blocked_domains:
|
||||||
self._update_stats(False)
|
if self._is_subdomain(domain, blocked):
|
||||||
return False
|
self._update_stats(False)
|
||||||
|
return False
|
||||||
|
|
||||||
# If no allowed domains specified, accept all non-blocked
|
# If no allowed domains specified, accept all non-blocked
|
||||||
if self._allowed_domains is None:
|
if self._allowed_domains is None:
|
||||||
self._update_stats(True)
|
self._update_stats(True)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# Final allowed domains check
|
# Check if domain matches any allowed domain (including subdomains)
|
||||||
result = domain in self._allowed_domains
|
for allowed in self._allowed_domains:
|
||||||
self._update_stats(result)
|
if self._is_subdomain(domain, allowed):
|
||||||
return result
|
self._update_stats(True)
|
||||||
|
return True
|
||||||
|
|
||||||
|
# No matches found
|
||||||
|
self._update_stats(False)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
class ContentRelevanceFilter(URLFilter):
|
class ContentRelevanceFilter(URLFilter):
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
import time
|
import time
|
||||||
from urllib.parse import urlparse
|
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString
|
from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString
|
||||||
import json
|
import json
|
||||||
@@ -33,6 +32,8 @@ import hashlib
|
|||||||
|
|
||||||
from urllib.robotparser import RobotFileParser
|
from urllib.robotparser import RobotFileParser
|
||||||
import aiohttp
|
import aiohttp
|
||||||
|
from urllib.parse import urlparse, urlunparse
|
||||||
|
from functools import lru_cache
|
||||||
|
|
||||||
from packaging import version
|
from packaging import version
|
||||||
from . import __version__
|
from . import __version__
|
||||||
@@ -1962,6 +1963,82 @@ def normalize_url(href, base_url):
|
|||||||
return normalized
|
return normalized
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_url_for_deep_crawl(href, base_url):
    """Normalize *href* (possibly relative to *base_url*) into a canonical URL.

    Normalizations applied:
      - relative URLs are resolved against ``base_url``
      - the hostname is lowercased (hostnames are case-insensitive)
      - the fragment is dropped (it never changes the fetched resource)
      - common tracking query parameters are removed
      - remaining query parameters are sorted so equivalent URLs produce
        identical strings
      - a trailing slash on the path is removed (a bare "/" is kept)

    Returns:
        The normalized URL string, or ``None`` when ``href`` is falsy.
    """
    from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode

    # Guard against None/empty hrefs from malformed anchors.
    if not href:
        return None

    # Resolve relative links against the page they were found on.
    full_url = urljoin(base_url, href.strip())
    parsed = urlparse(full_url)

    # Lowercase the host for consistent deduplication keys.
    netloc = parsed.netloc.lower()

    query = parsed.query
    if query:
        params = parse_qs(query)

        # Strip common tracking parameters that do not change page content
        # (extend this list as needed).
        tracking_params = ['utm_source', 'utm_medium', 'utm_campaign', 'ref', 'fbclid']
        for param in tracking_params:
            params.pop(param, None)

        # Rebuild the query string with parameters sorted for consistency.
        # (Fix: the previous version claimed to sort but emitted params in
        # parse order, so ?a=1&b=2 and ?b=2&a=1 deduplicated differently.)
        query = urlencode(sorted(params.items()), doseq=True) if params else ''

    # Build the normalized URL; fragment is removed entirely.
    normalized = urlunparse((
        parsed.scheme,
        netloc,
        parsed.path.rstrip('/') or '/',  # normalize trailing slash, keep root "/"
        parsed.params,
        query,
        '',
    ))

    return normalized
||||||
|
|
||||||
|
@lru_cache(maxsize=10000)
def efficient_normalize_url_for_deep_crawl(href, base_url):
    """Fast, memoized URL normalization.

    Resolves *href* against *base_url*, lowercases the host, and drops the
    fragment — only the cheapest, highest-value normalizations. Results are
    cached via ``lru_cache`` keyed on the (href, base_url) pair.

    Returns:
        The normalized URL string, or ``None`` when ``href`` is falsy.
    """
    from urllib.parse import urljoin

    if not href:
        return None

    # Resolve relative references, then re-parse for the targeted rewrites.
    resolved = urlparse(urljoin(base_url, href.strip()))

    # Case-fold the host and strip the fragment; everything else (path case,
    # query order, trailing slashes) is left untouched for speed.
    canonical = resolved._replace(netloc=resolved.netloc.lower(), fragment='')
    return canonical.geturl()
||||||
|
|
||||||
|
|
||||||
def normalize_url_tmp(href, base_url):
|
def normalize_url_tmp(href, base_url):
|
||||||
"""Normalize URLs to ensure consistent format"""
|
"""Normalize URLs to ensure consistent format"""
|
||||||
# Extract protocol and domain from base URL
|
# Extract protocol and domain from base URL
|
||||||
|
|||||||
78
docs/snippets/deep_crawl/intro.py
Normal file
78
docs/snippets/deep_crawl/intro.py
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
import asyncio
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from crawl4ai import (
|
||||||
|
AsyncWebCrawler,
|
||||||
|
CrawlerRunConfig,
|
||||||
|
BFSDeepCrawlStrategy,
|
||||||
|
CrawlResult,
|
||||||
|
FilterChain,
|
||||||
|
DomainFilter,
|
||||||
|
URLPatternFilter,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Import necessary classes from crawl4ai library:
|
||||||
|
# - AsyncWebCrawler: The main class for web crawling.
|
||||||
|
# - CrawlerRunConfig: Configuration class for crawler behavior.
|
||||||
|
# - BFSDeepCrawlStrategy: Breadth-First Search deep crawling strategy.
|
||||||
|
# - CrawlResult: Data model for individual crawl results.
|
||||||
|
# - FilterChain: Used to chain multiple URL filters.
|
||||||
|
# - URLPatternFilter: Filter URLs based on patterns.
|
||||||
|
# You had from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter, which is also correct,
|
||||||
|
# but for simplicity and consistency, we will use the direct import from crawl4ai in this example, as it is re-exported in __init__.py
|
||||||
|
|
||||||
|
async def basic_deep_crawl():
    """
    Run a small BFS deep crawl from a seed URL, demonstrating:
    - the breadth-first (BFS) deep-crawling strategy,
    - URL-pattern and domain filtering,
    - reading the crawl depth off each result's metadata.
    """
    # Keep only URLs that contain "text" — a cheap proxy for pages that are
    # likely to hold textual content.
    pattern_filter = URLPatternFilter(
        patterns=[
            "*text*",
        ]
    )

    # Restrict crawling to the groq.com domain and explicitly block example.com.
    scope_filter = DomainFilter(
        allowed_domains=["groq.com"],
        blocked_domains=["example.com"],
    )

    # BFS strategy: at most 2 levels deep, at most 10 pages, same-domain only,
    # with both filters applied to every candidate link via a FilterChain.
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=2,
            max_pages=10,
            include_external=False,
            filter_chain=FilterChain(filters=[pattern_filter, scope_filter]),
        ),
        verbose=True,  # detailed logging while the crawl runs
    )

    # The crawler is an async context manager: start/close are automatic.
    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            url="https://console.groq.com/docs",
            config=run_config,
        )

    # Report every crawled URL together with its BFS depth.
    for result in results:
        print(f"URL: {result.url}, Depth: {result.metadata.get('depth', 0)}")
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Fix: removed the redundant local `import asyncio` — the module already
    # imports asyncio at the top of the file.
    asyncio.run(basic_deep_crawl())
|
||||||
Reference in New Issue
Block a user