Merge PR #1667: Fix deep-crawl CLI outputting only the first page
This commit is contained in:
@@ -1235,9 +1235,21 @@ Always return valid, properly formatted JSON."""
|
||||
click.echo(json.dumps(extracted_items, indent=2))
|
||||
|
||||
elif output in ["markdown", "md"]:
|
||||
click.echo(main_result.markdown.raw_markdown)
|
||||
if isinstance(result, list):
|
||||
# Combine markdown from all crawled pages for deep crawl
|
||||
for r in all_results:
|
||||
click.echo(f"\n\n{'='*60}\n# {r.url}\n{'='*60}\n\n")
|
||||
click.echo(r.markdown.raw_markdown)
|
||||
else:
|
||||
click.echo(main_result.markdown.raw_markdown)
|
||||
elif output in ["markdown-fit", "md-fit"]:
|
||||
click.echo(main_result.markdown.fit_markdown)
|
||||
if isinstance(result, list):
|
||||
# Combine fit markdown from all crawled pages for deep crawl
|
||||
for r in all_results:
|
||||
click.echo(f"\n\n{'='*60}\n# {r.url}\n{'='*60}\n\n")
|
||||
click.echo(r.markdown.fit_markdown)
|
||||
else:
|
||||
click.echo(main_result.markdown.fit_markdown)
|
||||
else:
|
||||
if output == "all":
|
||||
with open(output_file, "w") as f:
|
||||
@@ -1251,10 +1263,22 @@ Always return valid, properly formatted JSON."""
|
||||
f.write(main_result.extracted_content)
|
||||
elif output in ["markdown", "md"]:
|
||||
with open(output_file, "w") as f:
|
||||
f.write(main_result.markdown.raw_markdown)
|
||||
if isinstance(result, list):
|
||||
# Combine markdown from all crawled pages for deep crawl
|
||||
for r in all_results:
|
||||
f.write(f"\n\n{'='*60}\n# {r.url}\n{'='*60}\n\n")
|
||||
f.write(r.markdown.raw_markdown)
|
||||
else:
|
||||
f.write(main_result.markdown.raw_markdown)
|
||||
elif output in ["markdown-fit", "md-fit"]:
|
||||
with open(output_file, "w") as f:
|
||||
f.write(main_result.markdown.fit_markdown)
|
||||
if isinstance(result, list):
|
||||
# Combine fit markdown from all crawled pages for deep crawl
|
||||
for r in all_results:
|
||||
f.write(f"\n\n{'='*60}\n# {r.url}\n{'='*60}\n\n")
|
||||
f.write(r.markdown.fit_markdown)
|
||||
else:
|
||||
f.write(main_result.markdown.fit_markdown)
|
||||
|
||||
except Exception as e:
|
||||
raise click.ClickException(str(e))
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
import pytest
|
||||
from click.testing import CliRunner
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
import json
|
||||
import yaml
|
||||
from crawl4ai.cli import cli, load_config_file, parse_key_values
|
||||
from crawl4ai.models import CrawlResult, MarkdownGenerationResult
|
||||
import tempfile
|
||||
import os
|
||||
import click
|
||||
@@ -129,5 +131,127 @@ class TestErrorHandling:
|
||||
])
|
||||
assert result.exit_code != 0
|
||||
|
||||
class TestDeepCrawlOutput:
|
||||
"""Tests for deep crawl output formatting"""
|
||||
|
||||
@pytest.fixture
|
||||
def mock_crawl_results(self):
|
||||
"""Create mock CrawlResult objects simulating deep crawl results"""
|
||||
def make_result(url, content):
|
||||
markdown = MarkdownGenerationResult(
|
||||
raw_markdown=content,
|
||||
markdown_with_citations=content,
|
||||
references_markdown="",
|
||||
fit_markdown=content,
|
||||
)
|
||||
result = CrawlResult(
|
||||
url=url,
|
||||
html=f"<html>{content}</html>",
|
||||
success=True,
|
||||
metadata={"depth": 0},
|
||||
)
|
||||
result._markdown = markdown
|
||||
return result
|
||||
|
||||
return [
|
||||
make_result("https://example.com/", "# Homepage\n\nWelcome to the homepage."),
|
||||
make_result("https://example.com/about", "# About\n\nAbout us page content."),
|
||||
make_result("https://example.com/contact", "# Contact\n\nContact information."),
|
||||
]
|
||||
|
||||
def test_deep_crawl_markdown_output_includes_all_pages(self, runner, mock_crawl_results):
|
||||
"""Test that deep crawl with markdown output includes all pages, not just the first"""
|
||||
with patch('crawl4ai.cli.anyio.run') as mock_anyio_run:
|
||||
# Return list of results (simulating deep crawl)
|
||||
mock_anyio_run.return_value = mock_crawl_results
|
||||
|
||||
result = runner.invoke(cli, [
|
||||
'crawl',
|
||||
'https://example.com',
|
||||
'--deep-crawl', 'bfs',
|
||||
'--max-pages', '3',
|
||||
'-o', 'markdown'
|
||||
])
|
||||
|
||||
assert result.exit_code == 0, f"CLI failed with: {result.output}"
|
||||
# Should contain content from ALL pages
|
||||
assert 'https://example.com/' in result.output
|
||||
assert 'https://example.com/about' in result.output
|
||||
assert 'https://example.com/contact' in result.output
|
||||
assert 'Homepage' in result.output
|
||||
assert 'About us page content' in result.output
|
||||
assert 'Contact information' in result.output
|
||||
|
||||
def test_deep_crawl_markdown_fit_output_includes_all_pages(self, runner, mock_crawl_results):
|
||||
"""Test that deep crawl with markdown-fit output includes all pages"""
|
||||
with patch('crawl4ai.cli.anyio.run') as mock_anyio_run:
|
||||
mock_anyio_run.return_value = mock_crawl_results
|
||||
|
||||
result = runner.invoke(cli, [
|
||||
'crawl',
|
||||
'https://example.com',
|
||||
'--deep-crawl', 'bfs',
|
||||
'--max-pages', '3',
|
||||
'-o', 'markdown-fit'
|
||||
])
|
||||
|
||||
assert result.exit_code == 0, f"CLI failed with: {result.output}"
|
||||
# Should contain all URLs
|
||||
assert 'https://example.com/' in result.output
|
||||
assert 'https://example.com/about' in result.output
|
||||
assert 'https://example.com/contact' in result.output
|
||||
|
||||
def test_deep_crawl_file_output_includes_all_pages(self, runner, mock_crawl_results, tmp_path):
|
||||
"""Test that deep crawl with file output includes all pages"""
|
||||
output_file = tmp_path / "output.md"
|
||||
|
||||
with patch('crawl4ai.cli.anyio.run') as mock_anyio_run:
|
||||
mock_anyio_run.return_value = mock_crawl_results
|
||||
|
||||
result = runner.invoke(cli, [
|
||||
'crawl',
|
||||
'https://example.com',
|
||||
'--deep-crawl', 'bfs',
|
||||
'--max-pages', '3',
|
||||
'-o', 'markdown',
|
||||
'-O', str(output_file)
|
||||
])
|
||||
|
||||
assert result.exit_code == 0, f"CLI failed with: {result.output}"
|
||||
content = output_file.read_text()
|
||||
# Should contain content from ALL pages
|
||||
assert 'https://example.com/' in content
|
||||
assert 'https://example.com/about' in content
|
||||
assert 'https://example.com/contact' in content
|
||||
|
||||
def test_single_crawl_markdown_output_unchanged(self, runner):
|
||||
"""Test that single (non-deep) crawl still works correctly"""
|
||||
markdown = MarkdownGenerationResult(
|
||||
raw_markdown="# Single Page\n\nContent here.",
|
||||
markdown_with_citations="# Single Page\n\nContent here.",
|
||||
references_markdown="",
|
||||
)
|
||||
single_result = CrawlResult(
|
||||
url="https://example.com/",
|
||||
html="<html>test</html>",
|
||||
success=True,
|
||||
)
|
||||
single_result._markdown = markdown
|
||||
|
||||
with patch('crawl4ai.cli.anyio.run') as mock_anyio_run:
|
||||
# Return single result (not a list)
|
||||
mock_anyio_run.return_value = single_result
|
||||
|
||||
result = runner.invoke(cli, [
|
||||
'crawl',
|
||||
'https://example.com',
|
||||
'-o', 'markdown'
|
||||
])
|
||||
|
||||
assert result.exit_code == 0, f"CLI failed with: {result.output}"
|
||||
assert '# Single Page' in result.output
|
||||
assert 'Content here' in result.output
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main(['-v', '-s', '--tb=native', __file__])
|
||||
pytest.main(['-v', '-s', '--tb=native', __file__])
|
||||
|
||||
Reference in New Issue
Block a user