Merge PR #1667: Fix deep-crawl CLI outputting only the first page

This commit is contained in:
unclecode
2026-02-01 06:21:25 +00:00
2 changed files with 153 additions and 5 deletions

View File

@@ -1235,9 +1235,21 @@ Always return valid, properly formatted JSON."""
click.echo(json.dumps(extracted_items, indent=2))
elif output in ["markdown", "md"]:
click.echo(main_result.markdown.raw_markdown)
if isinstance(result, list):
# Combine markdown from all crawled pages for deep crawl
for r in all_results:
click.echo(f"\n\n{'='*60}\n# {r.url}\n{'='*60}\n\n")
click.echo(r.markdown.raw_markdown)
else:
click.echo(main_result.markdown.raw_markdown)
elif output in ["markdown-fit", "md-fit"]:
click.echo(main_result.markdown.fit_markdown)
if isinstance(result, list):
# Combine fit markdown from all crawled pages for deep crawl
for r in all_results:
click.echo(f"\n\n{'='*60}\n# {r.url}\n{'='*60}\n\n")
click.echo(r.markdown.fit_markdown)
else:
click.echo(main_result.markdown.fit_markdown)
else:
if output == "all":
with open(output_file, "w") as f:
@@ -1251,10 +1263,22 @@ Always return valid, properly formatted JSON."""
f.write(main_result.extracted_content)
elif output in ["markdown", "md"]:
with open(output_file, "w") as f:
f.write(main_result.markdown.raw_markdown)
if isinstance(result, list):
# Combine markdown from all crawled pages for deep crawl
for r in all_results:
f.write(f"\n\n{'='*60}\n# {r.url}\n{'='*60}\n\n")
f.write(r.markdown.raw_markdown)
else:
f.write(main_result.markdown.raw_markdown)
elif output in ["markdown-fit", "md-fit"]:
with open(output_file, "w") as f:
f.write(main_result.markdown.fit_markdown)
if isinstance(result, list):
# Combine fit markdown from all crawled pages for deep crawl
for r in all_results:
f.write(f"\n\n{'='*60}\n# {r.url}\n{'='*60}\n\n")
f.write(r.markdown.fit_markdown)
else:
f.write(main_result.markdown.fit_markdown)
except Exception as e:
raise click.ClickException(str(e))

View File

@@ -1,9 +1,11 @@
import pytest
from click.testing import CliRunner
from pathlib import Path
from unittest.mock import patch
import json
import yaml
from crawl4ai.cli import cli, load_config_file, parse_key_values
from crawl4ai.models import CrawlResult, MarkdownGenerationResult
import tempfile
import os
import click
@@ -129,5 +131,127 @@ class TestErrorHandling:
])
assert result.exit_code != 0
class TestDeepCrawlOutput:
    """Tests for deep crawl output formatting"""

    @pytest.fixture
    def mock_crawl_results(self):
        """Create mock CrawlResult objects simulating deep crawl results"""

        def _build(url, content):
            # Each fake page carries identical raw/fit markdown so both the
            # "markdown" and "markdown-fit" output modes can be checked
            # against the same strings.
            md = MarkdownGenerationResult(
                raw_markdown=content,
                markdown_with_citations=content,
                references_markdown="",
                fit_markdown=content,
            )
            page = CrawlResult(
                url=url,
                html=f"<html>{content}</html>",
                success=True,
                metadata={"depth": 0},
            )
            page._markdown = md
            return page

        pages = [
            ("https://example.com/", "# Homepage\n\nWelcome to the homepage."),
            ("https://example.com/about", "# About\n\nAbout us page content."),
            ("https://example.com/contact", "# Contact\n\nContact information."),
        ]
        return [_build(url, content) for url, content in pages]

    def test_deep_crawl_markdown_output_includes_all_pages(self, runner, mock_crawl_results):
        """Test that deep crawl with markdown output includes all pages, not just the first"""
        with patch('crawl4ai.cli.anyio.run') as fake_run:
            # A list return value simulates a deep crawl (multiple pages).
            fake_run.return_value = mock_crawl_results
            result = runner.invoke(cli, [
                'crawl',
                'https://example.com',
                '--deep-crawl', 'bfs',
                '--max-pages', '3',
                '-o', 'markdown',
            ])
        assert result.exit_code == 0, f"CLI failed with: {result.output}"
        # Every crawled URL and its body text must appear in the combined output.
        for needle in (
            'https://example.com/',
            'https://example.com/about',
            'https://example.com/contact',
            'Homepage',
            'About us page content',
            'Contact information',
        ):
            assert needle in result.output

    def test_deep_crawl_markdown_fit_output_includes_all_pages(self, runner, mock_crawl_results):
        """Test that deep crawl with markdown-fit output includes all pages"""
        with patch('crawl4ai.cli.anyio.run') as fake_run:
            fake_run.return_value = mock_crawl_results
            result = runner.invoke(cli, [
                'crawl',
                'https://example.com',
                '--deep-crawl', 'bfs',
                '--max-pages', '3',
                '-o', 'markdown-fit',
            ])
        assert result.exit_code == 0, f"CLI failed with: {result.output}"
        # All three crawled URLs must be present in the echoed output.
        for url in (
            'https://example.com/',
            'https://example.com/about',
            'https://example.com/contact',
        ):
            assert url in result.output

    def test_deep_crawl_file_output_includes_all_pages(self, runner, mock_crawl_results, tmp_path):
        """Test that deep crawl with file output includes all pages"""
        output_file = tmp_path / "output.md"
        with patch('crawl4ai.cli.anyio.run') as fake_run:
            fake_run.return_value = mock_crawl_results
            result = runner.invoke(cli, [
                'crawl',
                'https://example.com',
                '--deep-crawl', 'bfs',
                '--max-pages', '3',
                '-o', 'markdown',
                '-O', str(output_file),
            ])
        assert result.exit_code == 0, f"CLI failed with: {result.output}"
        written = output_file.read_text()
        # The written file, not just stdout, must carry every page.
        for url in (
            'https://example.com/',
            'https://example.com/about',
            'https://example.com/contact',
        ):
            assert url in written

    def test_single_crawl_markdown_output_unchanged(self, runner):
        """Test that single (non-deep) crawl still works correctly"""
        md = MarkdownGenerationResult(
            raw_markdown="# Single Page\n\nContent here.",
            markdown_with_citations="# Single Page\n\nContent here.",
            references_markdown="",
        )
        page = CrawlResult(
            url="https://example.com/",
            html="<html>test</html>",
            success=True,
        )
        page._markdown = md
        with patch('crawl4ai.cli.anyio.run') as fake_run:
            # A bare CrawlResult (not a list) simulates a single-page crawl.
            fake_run.return_value = page
            result = runner.invoke(cli, [
                'crawl',
                'https://example.com',
                '-o', 'markdown',
            ])
        assert result.exit_code == 0, f"CLI failed with: {result.output}"
        assert '# Single Page' in result.output
        assert 'Content here' in result.output
# Allow running this test module directly; under normal use it is collected
# by pytest. The duplicated pytest.main(...) call that ran the suite twice
# has been removed.
if __name__ == '__main__':
    pytest.main(['-v', '-s', '--tb=native', __file__])