When using --deep-crawl, output all pages, not just the first one.
This commit is contained in:
@@ -1230,9 +1230,21 @@ Always return valid, properly formatted JSON."""
|
|||||||
click.echo(json.dumps(extracted_items, indent=2))
|
click.echo(json.dumps(extracted_items, indent=2))
|
||||||
|
|
||||||
elif output in ["markdown", "md"]:
|
elif output in ["markdown", "md"]:
|
||||||
click.echo(main_result.markdown.raw_markdown)
|
if isinstance(result, list):
|
||||||
|
# Combine markdown from all crawled pages for deep crawl
|
||||||
|
for r in all_results:
|
||||||
|
click.echo(f"\n\n{'='*60}\n# {r.url}\n{'='*60}\n\n")
|
||||||
|
click.echo(r.markdown.raw_markdown)
|
||||||
|
else:
|
||||||
|
click.echo(main_result.markdown.raw_markdown)
|
||||||
elif output in ["markdown-fit", "md-fit"]:
|
elif output in ["markdown-fit", "md-fit"]:
|
||||||
click.echo(main_result.markdown.fit_markdown)
|
if isinstance(result, list):
|
||||||
|
# Combine fit markdown from all crawled pages for deep crawl
|
||||||
|
for r in all_results:
|
||||||
|
click.echo(f"\n\n{'='*60}\n# {r.url}\n{'='*60}\n\n")
|
||||||
|
click.echo(r.markdown.fit_markdown)
|
||||||
|
else:
|
||||||
|
click.echo(main_result.markdown.fit_markdown)
|
||||||
else:
|
else:
|
||||||
if output == "all":
|
if output == "all":
|
||||||
with open(output_file, "w") as f:
|
with open(output_file, "w") as f:
|
||||||
@@ -1246,10 +1258,22 @@ Always return valid, properly formatted JSON."""
|
|||||||
f.write(main_result.extracted_content)
|
f.write(main_result.extracted_content)
|
||||||
elif output in ["markdown", "md"]:
|
elif output in ["markdown", "md"]:
|
||||||
with open(output_file, "w") as f:
|
with open(output_file, "w") as f:
|
||||||
f.write(main_result.markdown.raw_markdown)
|
if isinstance(result, list):
|
||||||
|
# Combine markdown from all crawled pages for deep crawl
|
||||||
|
for r in all_results:
|
||||||
|
f.write(f"\n\n{'='*60}\n# {r.url}\n{'='*60}\n\n")
|
||||||
|
f.write(r.markdown.raw_markdown)
|
||||||
|
else:
|
||||||
|
f.write(main_result.markdown.raw_markdown)
|
||||||
elif output in ["markdown-fit", "md-fit"]:
|
elif output in ["markdown-fit", "md-fit"]:
|
||||||
with open(output_file, "w") as f:
|
with open(output_file, "w") as f:
|
||||||
f.write(main_result.markdown.fit_markdown)
|
if isinstance(result, list):
|
||||||
|
# Combine fit markdown from all crawled pages for deep crawl
|
||||||
|
for r in all_results:
|
||||||
|
f.write(f"\n\n{'='*60}\n# {r.url}\n{'='*60}\n\n")
|
||||||
|
f.write(r.markdown.fit_markdown)
|
||||||
|
else:
|
||||||
|
f.write(main_result.markdown.fit_markdown)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise click.ClickException(str(e))
|
raise click.ClickException(str(e))
|
||||||
|
|||||||
@@ -1,9 +1,11 @@
|
|||||||
import pytest
|
import pytest
|
||||||
from click.testing import CliRunner
|
from click.testing import CliRunner
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch
|
||||||
import json
|
import json
|
||||||
import yaml
|
import yaml
|
||||||
from crawl4ai.cli import cli, load_config_file, parse_key_values
|
from crawl4ai.cli import cli, load_config_file, parse_key_values
|
||||||
|
from crawl4ai.models import CrawlResult, MarkdownGenerationResult
|
||||||
import tempfile
|
import tempfile
|
||||||
import os
|
import os
|
||||||
import click
|
import click
|
||||||
@@ -129,5 +131,127 @@ class TestErrorHandling:
|
|||||||
])
|
])
|
||||||
assert result.exit_code != 0
|
assert result.exit_code != 0
|
||||||
|
|
||||||
|
class TestDeepCrawlOutput:
    """Tests for deep crawl output formatting.

    Every test patches ``crawl4ai.cli.anyio.run`` so no crawl is performed;
    the CLI receives canned ``CrawlResult`` objects instead: a list to
    simulate a deep crawl, a single object to simulate a regular crawl.
    """

    @pytest.fixture
    def mock_crawl_results(self):
        """Create mock CrawlResult objects simulating deep crawl results."""
        def make_result(url, content):
            markdown = MarkdownGenerationResult(
                raw_markdown=content,
                markdown_with_citations=content,
                references_markdown="",
                fit_markdown=content,
            )
            result = CrawlResult(
                url=url,
                html=f"<html>{content}</html>",
                success=True,
                metadata={"depth": 0},
            )
            # NOTE(review): presumably the public ``markdown`` accessor reads
            # this private attribute -- confirm against CrawlResult.
            result._markdown = markdown
            return result

        return [
            make_result("https://example.com/", "# Homepage\n\nWelcome to the homepage."),
            make_result("https://example.com/about", "# About\n\nAbout us page content."),
            make_result("https://example.com/contact", "# Contact\n\nContact information."),
        ]

    @staticmethod
    def _invoke_deep_crawl(runner, crawl_results, *output_args):
        """Run ``crawl --deep-crawl bfs`` with ``anyio.run`` returning *crawl_results*.

        Args:
            runner: the click ``CliRunner`` fixture.
            crawl_results: value the patched ``anyio.run`` hands back to the CLI.
            *output_args: extra CLI arguments selecting the output format/file.

        Returns:
            The click ``Result`` of the invocation.
        """
        with patch('crawl4ai.cli.anyio.run') as mock_anyio_run:
            # Return list of results (simulating deep crawl)
            mock_anyio_run.return_value = crawl_results
            return runner.invoke(cli, [
                'crawl',
                'https://example.com',
                '--deep-crawl', 'bfs',
                '--max-pages', '3',
                *output_args,
            ])

    @staticmethod
    def _assert_all_pages_present(text, crawl_results):
        """Assert *text* contains the heading and body of every crawled page.

        The heading is matched as the full ``# <url>`` line, newline included.
        A bare URL substring check is not enough: 'https://example.com/' is a
        prefix of the other two page URLs, so it would pass even when the
        homepage section is missing from the output.
        """
        for page in crawl_results:
            assert f"# {page.url}\n" in text, f"Missing page header for {page.url}"
        assert 'Homepage' in text
        assert 'About us page content' in text
        assert 'Contact information' in text

    def test_deep_crawl_markdown_output_includes_all_pages(self, runner, mock_crawl_results):
        """Test that deep crawl with markdown output includes all pages, not just the first"""
        result = self._invoke_deep_crawl(runner, mock_crawl_results, '-o', 'markdown')

        assert result.exit_code == 0, f"CLI failed with: {result.output}"
        # Should contain content from ALL pages
        self._assert_all_pages_present(result.output, mock_crawl_results)

    def test_deep_crawl_markdown_fit_output_includes_all_pages(self, runner, mock_crawl_results):
        """Test that deep crawl with markdown-fit output includes all pages"""
        result = self._invoke_deep_crawl(runner, mock_crawl_results, '-o', 'markdown-fit')

        assert result.exit_code == 0, f"CLI failed with: {result.output}"
        # The fixture sets fit_markdown to the same text as raw_markdown,
        # so the same per-page checks apply.
        self._assert_all_pages_present(result.output, mock_crawl_results)

    def test_deep_crawl_file_output_includes_all_pages(self, runner, mock_crawl_results, tmp_path):
        """Test that deep crawl with file output includes all pages"""
        output_file = tmp_path / "output.md"

        result = self._invoke_deep_crawl(
            runner, mock_crawl_results, '-o', 'markdown', '-O', str(output_file)
        )

        assert result.exit_code == 0, f"CLI failed with: {result.output}"
        content = output_file.read_text()
        # Should contain content from ALL pages
        self._assert_all_pages_present(content, mock_crawl_results)

    def test_single_crawl_markdown_output_unchanged(self, runner):
        """Test that single (non-deep) crawl still works correctly"""
        markdown = MarkdownGenerationResult(
            raw_markdown="# Single Page\n\nContent here.",
            markdown_with_citations="# Single Page\n\nContent here.",
            references_markdown="",
        )
        single_result = CrawlResult(
            url="https://example.com/",
            html="<html>test</html>",
            success=True,
        )
        single_result._markdown = markdown

        with patch('crawl4ai.cli.anyio.run') as mock_anyio_run:
            # Return single result (not a list)
            mock_anyio_run.return_value = single_result

            result = runner.invoke(cli, [
                'crawl',
                'https://example.com',
                '-o', 'markdown'
            ])

        assert result.exit_code == 0, f"CLI failed with: {result.output}"
        assert '# Single Page' in result.output
        assert 'Content here' in result.output
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Executed as a script: hand this file to pytest with verbose output,
    # no output capture, and native Python tracebacks.
    pytest_args = ['-v', '-s', '--tb=native', __file__]
    pytest.main(pytest_args)
|
||||||
Reference in New Issue
Block a user