Compare commits

..

2 Commits

4 changed files with 169 additions and 200 deletions

View File

@@ -845,15 +845,6 @@ class AsyncUrlSeeder:
return return
data = gzip.decompress(r.content) if url.endswith(".gz") else r.content data = gzip.decompress(r.content) if url.endswith(".gz") else r.content
base_url = str(r.url)
def _normalize_loc(raw: Optional[str]) -> Optional[str]:
    """Resolve a raw ``<loc>`` value against ``base_url``.

    Returns ``None`` when ``raw`` is missing, empty, or whitespace-only, or
    when the joined result is empty; otherwise returns the joined URL.
    """
    if not raw:
        return None
    candidate = raw.strip()
    # A whitespace-only <loc> strips to "", and urljoin(base, "") hands back
    # the base unchanged, which would report the sitemap's own URL as an entry.
    if not candidate:
        return None
    normalized = urljoin(base_url, candidate)
    if not normalized:
        return None
    return normalized
# Detect if this is a sitemap index by checking for <sitemapindex> or presence of <sitemap> elements # Detect if this is a sitemap index by checking for <sitemapindex> or presence of <sitemap> elements
is_sitemap_index = False is_sitemap_index = False
@@ -866,42 +857,25 @@ class AsyncUrlSeeder:
# Use XML parser for sitemaps, not HTML parser # Use XML parser for sitemaps, not HTML parser
parser = etree.XMLParser(recover=True) parser = etree.XMLParser(recover=True)
root = etree.fromstring(data, parser=parser) root = etree.fromstring(data, parser=parser)
# Namespace-agnostic lookups using local-name() so we honor custom or missing namespaces
sitemap_loc_nodes = root.xpath("//*[local-name()='sitemap']/*[local-name()='loc']")
url_loc_nodes = root.xpath("//*[local-name()='url']/*[local-name()='loc']")
self._log( # Define namespace for sitemap
"debug", ns = {'s': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
"Parsed sitemap {url}: {sitemap_count} sitemap entries, {url_count} url entries discovered",
params={
"url": url,
"sitemap_count": len(sitemap_loc_nodes),
"url_count": len(url_loc_nodes),
},
tag="URL_SEED",
)
# Check for sitemap index entries # Check for sitemap index entries
if sitemap_loc_nodes: sitemap_locs = root.xpath('//s:sitemap/s:loc', namespaces=ns)
if sitemap_locs:
is_sitemap_index = True is_sitemap_index = True
for sitemap_elem in sitemap_loc_nodes: for sitemap_elem in sitemap_locs:
loc = _normalize_loc(sitemap_elem.text) loc = sitemap_elem.text.strip() if sitemap_elem.text else ""
if loc: if loc:
sub_sitemaps.append(loc) sub_sitemaps.append(loc)
# If not a sitemap index, get regular URLs # If not a sitemap index, get regular URLs
if not is_sitemap_index: if not is_sitemap_index:
for loc_elem in url_loc_nodes: for loc_elem in root.xpath('//s:url/s:loc', namespaces=ns):
loc = _normalize_loc(loc_elem.text) loc = loc_elem.text.strip() if loc_elem.text else ""
if loc: if loc:
regular_urls.append(loc) regular_urls.append(loc)
if not regular_urls:
self._log(
"warning",
"No <loc> entries found inside <url> tags for sitemap {url}. The sitemap might be empty or use an unexpected structure.",
params={"url": url},
tag="URL_SEED",
)
except Exception as e: except Exception as e:
self._log("error", "LXML parsing error for sitemap {url}: {error}", self._log("error", "LXML parsing error for sitemap {url}: {error}",
params={"url": url, "error": str(e)}, tag="URL_SEED") params={"url": url, "error": str(e)}, tag="URL_SEED")
@@ -918,39 +892,19 @@ class AsyncUrlSeeder:
# Check for sitemap index entries # Check for sitemap index entries
sitemaps = root.findall('.//sitemap') sitemaps = root.findall('.//sitemap')
url_entries = root.findall('.//url')
self._log(
"debug",
"ElementTree parsed sitemap {url}: {sitemap_count} sitemap entries, {url_count} url entries discovered",
params={
"url": url,
"sitemap_count": len(sitemaps),
"url_count": len(url_entries),
},
tag="URL_SEED",
)
if sitemaps: if sitemaps:
is_sitemap_index = True is_sitemap_index = True
for sitemap in sitemaps: for sitemap in sitemaps:
loc_elem = sitemap.find('loc') loc_elem = sitemap.find('loc')
loc = _normalize_loc(loc_elem.text if loc_elem is not None else None) if loc_elem is not None and loc_elem.text:
if loc: sub_sitemaps.append(loc_elem.text.strip())
sub_sitemaps.append(loc)
# If not a sitemap index, get regular URLs # If not a sitemap index, get regular URLs
if not is_sitemap_index: if not is_sitemap_index:
for url_elem in url_entries: for url_elem in root.findall('.//url'):
loc_elem = url_elem.find('loc') loc_elem = url_elem.find('loc')
loc = _normalize_loc(loc_elem.text if loc_elem is not None else None) if loc_elem is not None and loc_elem.text:
if loc: regular_urls.append(loc_elem.text.strip())
regular_urls.append(loc)
if not regular_urls:
self._log(
"warning",
"No <loc> entries found inside <url> tags for sitemap {url}. The sitemap might be empty or use an unexpected structure.",
params={"url": url},
tag="URL_SEED",
)
except Exception as e: except Exception as e:
self._log("error", "ElementTree parsing error for sitemap {url}: {error}", self._log("error", "ElementTree parsing error for sitemap {url}: {error}",
params={"url": url, "error": str(e)}, tag="URL_SEED") params={"url": url, "error": str(e)}, tag="URL_SEED")

View File

@@ -4,14 +4,26 @@ from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
from ..models import CrawlResult from ..models import CrawlResult
from .bfs_strategy import BFSDeepCrawlStrategy # noqa from .bfs_strategy import BFSDeepCrawlStrategy # noqa
from ..types import AsyncWebCrawler, CrawlerRunConfig from ..types import AsyncWebCrawler, CrawlerRunConfig
from ..utils import normalize_url_for_deep_crawl
class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy): class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
""" """
Depth-First Search (DFS) deep crawling strategy. Depth-first deep crawling with familiar BFS rules.
Inherits URL validation and link discovery from BFSDeepCrawlStrategy. We reuse the same filters, scoring, and page limits from :class:`BFSDeepCrawlStrategy`,
Overrides _arun_batch and _arun_stream to use a stack (LIFO) for DFS traversal. but walk the graph with a stack so we fully explore one branch before hopping to the
next. DFS also keeps its own ``_dfs_seen`` set so we can drop duplicate links at
discovery time without accidentally marking them as “already crawled”.
""" """
def __init__(self, *args, **kwargs):
    """Forward every argument to the parent initializer, then attach an empty DFS dedupe set."""
    super().__init__(*args, **kwargs)
    # Discovery-time dedupe set; starts out empty.
    seen_urls: Set[str] = set()
    self._dfs_seen = seen_urls
def _reset_seen(self, start_url: str) -> None:
    """Replace ``_dfs_seen`` with a brand-new set that contains only ``start_url``."""
    fresh_seen = set()
    fresh_seen.add(start_url)
    self._dfs_seen = fresh_seen
async def _arun_batch( async def _arun_batch(
self, self,
start_url: str, start_url: str,
@@ -19,14 +31,19 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
config: CrawlerRunConfig, config: CrawlerRunConfig,
) -> List[CrawlResult]: ) -> List[CrawlResult]:
""" """
Batch (non-streaming) DFS mode. Crawl level-by-level but emit results at the end.
Uses a stack to traverse URLs in DFS order, aggregating CrawlResults into a list.
We keep a stack of ``(url, parent, depth)`` tuples, pop one at a time, and
hand it to ``crawler.arun_many`` with deep crawling disabled so we remain
in control of traversal. Every successful page bumps ``_pages_crawled`` and
seeds new stack items discovered via :meth:`link_discovery`.
""" """
visited: Set[str] = set() visited: Set[str] = set()
# Stack items: (url, parent_url, depth) # Stack items: (url, parent_url, depth)
stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)] stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
depths: Dict[str, int] = {start_url: 0} depths: Dict[str, int] = {start_url: 0}
results: List[CrawlResult] = [] results: List[CrawlResult] = []
self._reset_seen(start_url)
while stack and not self._cancel_event.is_set(): while stack and not self._cancel_event.is_set():
url, parent, depth = stack.pop() url, parent, depth = stack.pop()
@@ -71,12 +88,16 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
config: CrawlerRunConfig, config: CrawlerRunConfig,
) -> AsyncGenerator[CrawlResult, None]: ) -> AsyncGenerator[CrawlResult, None]:
""" """
Streaming DFS mode. Same traversal as :meth:`_arun_batch`, but yield pages immediately.
Uses a stack to traverse URLs in DFS order and yields CrawlResults as they become available.
Each popped URL is crawled, its metadata annotated, then the result gets
yielded before we even look at the next stack entry. Successful crawls
still feed :meth:`link_discovery`, keeping DFS order intact.
""" """
visited: Set[str] = set() visited: Set[str] = set()
stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)] stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
depths: Dict[str, int] = {start_url: 0} depths: Dict[str, int] = {start_url: 0}
self._reset_seen(start_url)
while stack and not self._cancel_event.is_set(): while stack and not self._cancel_event.is_set():
url, parent, depth = stack.pop() url, parent, depth = stack.pop()
@@ -108,3 +129,92 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
for new_url, new_parent in reversed(new_links): for new_url, new_parent in reversed(new_links):
new_depth = depths.get(new_url, depth + 1) new_depth = depths.get(new_url, depth + 1)
stack.append((new_url, new_parent, new_depth)) stack.append((new_url, new_parent, new_depth))
async def link_discovery(
    self,
    result: CrawlResult,
    source_url: str,
    current_depth: int,
    _visited: Set[str],
    next_level: List[Tuple[str, Optional[str]]],
    depths: Dict[str, int],
) -> None:
    """
    Find the next URLs we should push onto the DFS stack.

    Parameters
    ----------
    result : CrawlResult
        Output of the page we just crawled; its ``links`` block is our raw material.
    source_url : str
        URL of the parent page; stored so callers can track ancestry.
    current_depth : int
        Depth of the parent; children naturally sit at ``current_depth + 1``.
    _visited : Set[str]
        Present to match the BFS signature, but we rely on ``_dfs_seen`` instead.
    next_level : list of tuples
        The stack buffer supplied by the caller; we append new ``(url, parent)`` items here.
    depths : dict
        Shared depth map so future metadata tagging knows how deep each URL lives.

    Notes
    -----
    - Returns immediately when children would exceed ``max_depth`` or when
      ``max_pages - _pages_crawled`` leaves no capacity.
    - ``_dfs_seen`` keeps us from pushing duplicates without touching the traversal guard.
    - The candidate list is a copy, so ``result.links["internal"]`` is never
      extended with external links as a side effect.
    - When more links qualify than capacity remains, the list is cut to the
      remaining capacity (highest score first when a scorer is configured).
    """
    next_depth = current_depth + 1
    if next_depth > self.max_depth:
        return

    remaining_capacity = self.max_pages - self._pages_crawled
    if remaining_capacity <= 0:
        self.logger.info(
            f"Max pages limit ({self.max_pages}) reached, stopping link discovery"
        )
        return

    # Copy first: ``links += ...`` on the list stored inside ``result.links``
    # would extend it in place and leak external links into the "internal"
    # list of the result that is later handed back to the caller.
    links = list(result.links.get("internal", []))
    if self.include_external:
        links.extend(result.links.get("external", []))

    seen = self._dfs_seen
    valid_links: List[Tuple[str, float]] = []
    for link in links:
        raw_url = link.get("href")
        if not raw_url:
            continue

        normalized_url = normalize_url_for_deep_crawl(raw_url, source_url)
        if not normalized_url or normalized_url in seen:
            continue

        # Validation runs on the raw href, not the normalized form.
        if not await self.can_process_url(raw_url, next_depth):
            self.stats.urls_skipped += 1
            continue

        score = self.url_scorer.score(normalized_url) if self.url_scorer else 0
        if score < self.score_threshold:
            self.logger.debug(
                f"URL {normalized_url} skipped: score {score} below threshold {self.score_threshold}"
            )
            self.stats.urls_skipped += 1
            continue

        # Marked as seen before capacity trimming, so a URL trimmed below
        # stays in ``_dfs_seen`` and is not offered again later.
        seen.add(normalized_url)
        valid_links.append((normalized_url, score))

    if len(valid_links) > remaining_capacity:
        if self.url_scorer:
            valid_links.sort(key=lambda x: x[1], reverse=True)
        valid_links = valid_links[:remaining_capacity]
        self.logger.info(
            f"Limiting to {remaining_capacity} URLs due to max_pages limit"
        )

    for url, score in valid_links:
        if score:
            # NOTE(review): this writes the child's score onto the parent
            # result and is overwritten by each scored child - confirm intent.
            result.metadata = result.metadata or {}
            result.metadata["score"] = score
        next_level.append((url, source_url))
        depths[url] = next_depth

View File

@@ -0,0 +1,39 @@
"""
Simple demonstration of the DFS deep crawler visiting multiple pages.
Run with: python docs/examples/dfs_crawl_demo.py
"""
import asyncio
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.cache_context import CacheMode
from crawl4ai.deep_crawling.dfs_strategy import DFSDeepCrawlStrategy
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
async def main() -> None:
    """Run a streaming DFS deep crawl from the seed URL and print one status line per result."""
    strategy = DFSDeepCrawlStrategy(max_depth=3, max_pages=50, include_external=False)
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
        markdown_generator=DefaultMarkdownGenerator(),
        stream=True,
    )
    seed_url = "https://docs.python.org/3/"  # Plenty of internal links
    browser_config = BrowserConfig(headless=True)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # With stream=True the awaited call is iterated asynchronously.
        result_stream = await crawler.arun(url=seed_url, config=run_config)
        async for result in result_stream:
            depth = result.metadata.get("depth")
            if result.success:
                status = "SUCCESS"
            else:
                status = "FAILED"
            print(f"[{status}] depth={depth} url={result.url}")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -1,134 +0,0 @@
import sys
from types import SimpleNamespace
import pytest
# Provide a lightweight stub for rank_bm25 before importing the seeder to avoid
# optional dependency issues (e.g., incompatible wheels in CI).
class _FakeBM25:
    """Tiny stand-in for ``BM25Okapi`` that gives every corpus entry a score of 1.0."""

    def __init__(self, corpus):
        # One constant score per corpus entry, computed once up front.
        entry_count = len(corpus)
        self._scores = [1.0 for _ in range(entry_count)]

    def get_scores(self, tokens):
        """Return the stored score list; ``tokens`` is ignored."""
        return self._scores


# setdefault leaves any "rank_bm25" entry already present in sys.modules untouched.
sys.modules.setdefault("rank_bm25", SimpleNamespace(BM25Okapi=_FakeBM25))
from crawl4ai.async_url_seeder import AsyncUrlSeeder
class DummyResponse:
    """Minimal HTTP response double: status is always 200 and the body is stored as UTF-8 bytes."""

    def __init__(self, request_url: str, text: str):
        encoded_body = text.encode("utf-8")
        self.status_code = 200
        self._content = encoded_body
        self.url = request_url

    def raise_for_status(self):
        """Do nothing and return ``None``."""
        return None

    @property
    def content(self):
        """Body as raw bytes."""
        return self._content

    @property
    def text(self):
        """Body decoded from UTF-8."""
        return str(self._content, "utf-8")
class DummyAsyncClient:
    """Async client double that answers ``get`` from a URL-to-payload mapping."""

    def __init__(self, response_map):
        self._responses = response_map

    async def get(self, url, **kwargs):
        """Look up ``url`` and wrap its payload in a ``DummyResponse``.

        A callable payload is invoked first and its return value is used as
        the body. Extra keyword arguments are accepted and ignored. An
        unknown URL raises ``KeyError`` from the mapping lookup.
        """
        body = self._responses[url]
        resolved = body() if callable(body) else body
        return DummyResponse(url, resolved)
@pytest.mark.asyncio
async def test_iter_sitemap_handles_namespace_less_sitemaps():
    """A ``<urlset>`` with no xmlns declaration yields each ``<loc>`` in document order."""
    xml = """<?xml version="1.0"?>
<urlset>
<url><loc>https://example.com/a</loc></url>
<url><loc>https://example.com/b</loc></url>
</urlset>
"""
    client = DummyAsyncClient({"https://example.com/sitemap.xml": xml})
    seeder = AsyncUrlSeeder(client=client)

    urls = [u async for u in seeder._iter_sitemap("https://example.com/sitemap.xml")]

    assert urls == ["https://example.com/a", "https://example.com/b"]
@pytest.mark.asyncio
async def test_iter_sitemap_handles_custom_namespace():
    """A ``<urlset>`` declared under a non-standard default namespace still yields its ``<loc>``."""
    xml = """<?xml version="1.0"?>
<urlset xmlns="https://custom.namespace/schema">
<url><loc>https://example.com/ns</loc></url>
</urlset>
"""
    client = DummyAsyncClient({"https://example.com/ns-sitemap.xml": xml})
    seeder = AsyncUrlSeeder(client=client)

    urls = [u async for u in seeder._iter_sitemap("https://example.com/ns-sitemap.xml")]

    assert urls == ["https://example.com/ns"]
@pytest.mark.asyncio
async def test_iter_sitemap_handles_namespace_index_and_children():
    """A namespaced sitemap index is followed into both child sitemaps and their pages are yielded."""
    index_xml = """<?xml version="1.0"?>
<sitemapindex xmlns="http://another.example/ns">
<sitemap>
<loc>https://example.com/child-1.xml</loc>
</sitemap>
<sitemap>
<loc>https://example.com/child-2.xml</loc>
</sitemap>
</sitemapindex>
"""
    # Template for the child sitemaps; ``{n}`` is filled in per child.
    child_xml = """<?xml version="1.0"?>
<urlset xmlns="http://irrelevant">
<url><loc>https://example.com/page-{n}</loc></url>
</urlset>
"""
    first_child = child_xml.format(n=1)
    second_child = child_xml.format(n=2)
    responses = {
        "https://example.com/index.xml": index_xml,
        "https://example.com/child-1.xml": first_child,
        "https://example.com/child-2.xml": second_child,
    }
    seeder = AsyncUrlSeeder(client=DummyAsyncClient(responses))

    urls = [u async for u in seeder._iter_sitemap("https://example.com/index.xml")]

    # Sorted before comparing, so the order in which children are visited is not pinned.
    expected = [
        "https://example.com/page-1",
        "https://example.com/page-2",
    ]
    assert sorted(urls) == expected
@pytest.mark.asyncio
async def test_iter_sitemap_normalizes_relative_locations():
    """A relative ``<loc>`` comes back absolute against the sitemap URL; an absolute one is unchanged."""
    xml = """<?xml version="1.0"?>
<urlset>
<url><loc>/relative-path</loc></url>
<url><loc>https://example.com/absolute</loc></url>
</urlset>
"""
    client = DummyAsyncClient({"https://example.com/sitemap.xml": xml})
    seeder = AsyncUrlSeeder(client=client)

    urls = [u async for u in seeder._iter_sitemap("https://example.com/sitemap.xml")]

    expected = [
        "https://example.com/relative-path",
        "https://example.com/absolute",
    ]
    assert urls == expected