When using --deep-crawl, output all pages, not just the first one.

This commit is contained in:
Christian Oudard
2025-12-10 10:12:01 -07:00
parent 3a07c5962c
commit 220a2246d3
2 changed files with 153 additions and 5 deletions

View File

@@ -1230,9 +1230,21 @@ Always return valid, properly formatted JSON."""
                 click.echo(json.dumps(extracted_items, indent=2))
             elif output in ["markdown", "md"]:
-                click.echo(main_result.markdown.raw_markdown)
+                if isinstance(result, list):
+                    # Combine markdown from all crawled pages for deep crawl
+                    for r in all_results:
+                        click.echo(f"\n\n{'='*60}\n# {r.url}\n{'='*60}\n\n")
+                        click.echo(r.markdown.raw_markdown)
+                else:
+                    click.echo(main_result.markdown.raw_markdown)
             elif output in ["markdown-fit", "md-fit"]:
-                click.echo(main_result.markdown.fit_markdown)
+                if isinstance(result, list):
+                    # Combine fit markdown from all crawled pages for deep crawl
+                    for r in all_results:
+                        click.echo(f"\n\n{'='*60}\n# {r.url}\n{'='*60}\n\n")
+                        click.echo(r.markdown.fit_markdown)
+                else:
+                    click.echo(main_result.markdown.fit_markdown)
         else:
             if output == "all":
                 with open(output_file, "w") as f:
@@ -1246,10 +1258,22 @@ Always return valid, properly formatted JSON."""
                     f.write(main_result.extracted_content)
             elif output in ["markdown", "md"]:
                 with open(output_file, "w") as f:
-                    f.write(main_result.markdown.raw_markdown)
+                    if isinstance(result, list):
+                        # Combine markdown from all crawled pages for deep crawl
+                        for r in all_results:
+                            f.write(f"\n\n{'='*60}\n# {r.url}\n{'='*60}\n\n")
+                            f.write(r.markdown.raw_markdown)
+                    else:
+                        f.write(main_result.markdown.raw_markdown)
             elif output in ["markdown-fit", "md-fit"]:
                 with open(output_file, "w") as f:
-                    f.write(main_result.markdown.fit_markdown)
+                    if isinstance(result, list):
+                        # Combine fit markdown from all crawled pages for deep crawl
+                        for r in all_results:
+                            f.write(f"\n\n{'='*60}\n# {r.url}\n{'='*60}\n\n")
+                            f.write(r.markdown.fit_markdown)
+                    else:
+                        f.write(main_result.markdown.fit_markdown)
     except Exception as e:
         raise click.ClickException(str(e))

View File

@@ -1,9 +1,11 @@
 import pytest
 from click.testing import CliRunner
 from pathlib import Path
+from unittest.mock import patch
 import json
 import yaml
 from crawl4ai.cli import cli, load_config_file, parse_key_values
+from crawl4ai.models import CrawlResult, MarkdownGenerationResult
 import tempfile
 import os
 import click
@@ -129,5 +131,127 @@ class TestErrorHandling:
         ])
         assert result.exit_code != 0
class TestDeepCrawlOutput:
    """Tests for deep crawl output formatting.

    Deep crawls return a *list* of CrawlResult objects; these tests verify
    that the CLI emits every page's markdown (stdout and file output), and
    that the single-result (non-deep) path is unaffected.
    """

    @pytest.fixture
    def mock_crawl_results(self):
        """Create mock CrawlResult objects simulating deep crawl results."""

        def _fake_page(url, content):
            # Build a CrawlResult with its markdown pre-attached, bypassing
            # any real crawling; _markdown backs the .markdown property.
            md = MarkdownGenerationResult(
                raw_markdown=content,
                markdown_with_citations=content,
                references_markdown="",
                fit_markdown=content,
            )
            page = CrawlResult(
                url=url,
                html=f"<html>{content}</html>",
                success=True,
                metadata={"depth": 0},
            )
            page._markdown = md
            return page

        pages = [
            ("https://example.com/", "# Homepage\n\nWelcome to the homepage."),
            ("https://example.com/about", "# About\n\nAbout us page content."),
            ("https://example.com/contact", "# Contact\n\nContact information."),
        ]
        return [_fake_page(url, body) for url, body in pages]

    def test_deep_crawl_markdown_output_includes_all_pages(self, runner, mock_crawl_results):
        """Test that deep crawl with markdown output includes all pages, not just the first."""
        with patch('crawl4ai.cli.anyio.run') as mock_anyio_run:
            # Return a list of results, as a deep crawl would.
            mock_anyio_run.return_value = mock_crawl_results
            result = runner.invoke(cli, [
                'crawl',
                'https://example.com',
                '--deep-crawl', 'bfs',
                '--max-pages', '3',
                '-o', 'markdown',
            ])
        assert result.exit_code == 0, f"CLI failed with: {result.output}"
        # Content from ALL crawled pages must appear, not just the first.
        assert 'https://example.com/' in result.output
        assert 'https://example.com/about' in result.output
        assert 'https://example.com/contact' in result.output
        assert 'Homepage' in result.output
        assert 'About us page content' in result.output
        assert 'Contact information' in result.output

    def test_deep_crawl_markdown_fit_output_includes_all_pages(self, runner, mock_crawl_results):
        """Test that deep crawl with markdown-fit output includes all pages."""
        with patch('crawl4ai.cli.anyio.run') as mock_anyio_run:
            mock_anyio_run.return_value = mock_crawl_results
            result = runner.invoke(cli, [
                'crawl',
                'https://example.com',
                '--deep-crawl', 'bfs',
                '--max-pages', '3',
                '-o', 'markdown-fit',
            ])
        assert result.exit_code == 0, f"CLI failed with: {result.output}"
        # Every crawled URL should be present in the combined output.
        assert 'https://example.com/' in result.output
        assert 'https://example.com/about' in result.output
        assert 'https://example.com/contact' in result.output

    def test_deep_crawl_file_output_includes_all_pages(self, runner, mock_crawl_results, tmp_path):
        """Test that deep crawl with file output includes all pages."""
        output_file = tmp_path / "output.md"
        with patch('crawl4ai.cli.anyio.run') as mock_anyio_run:
            mock_anyio_run.return_value = mock_crawl_results
            result = runner.invoke(cli, [
                'crawl',
                'https://example.com',
                '--deep-crawl', 'bfs',
                '--max-pages', '3',
                '-o', 'markdown',
                '-O', str(output_file),
            ])
        assert result.exit_code == 0, f"CLI failed with: {result.output}"
        content = output_file.read_text()
        # The written file must contain every crawled page.
        assert 'https://example.com/' in content
        assert 'https://example.com/about' in content
        assert 'https://example.com/contact' in content

    def test_single_crawl_markdown_output_unchanged(self, runner):
        """Test that single (non-deep) crawl still works correctly."""
        md = MarkdownGenerationResult(
            raw_markdown="# Single Page\n\nContent here.",
            markdown_with_citations="# Single Page\n\nContent here.",
            references_markdown="",
        )
        single_result = CrawlResult(
            url="https://example.com/",
            html="<html>test</html>",
            success=True,
        )
        single_result._markdown = md
        with patch('crawl4ai.cli.anyio.run') as mock_anyio_run:
            # Return a bare CrawlResult (not a list) — the non-deep path.
            mock_anyio_run.return_value = single_result
            result = runner.invoke(cli, [
                'crawl',
                'https://example.com',
                '-o', 'markdown',
            ])
        assert result.exit_code == 0, f"CLI failed with: {result.output}"
        assert '# Single Page' in result.output
        assert 'Content here' in result.output
 if __name__ == '__main__':
     pytest.main(['-v', '-s', '--tb=native', __file__])