In this commit, we introduce the new concept of MarkdownGenerationStrategy, which lets us add future strategies for generating better markdown. For now, we still generate raw markdown as we did before. We have a new BM25-based algorithm for producing fit markdown, and we now add the ability to refine markdown into a citation form: links are extracted and replaced by citation reference numbers, and a references section at the very end lists all the links with their descriptions. This format is more suitable for large language models. When the links do not need to be passed along, we can reduce the size of the markdown significantly and attach the list of references to a large language model as a separate file. This commit contains changes in this direction.
This commit is contained in:
@@ -13,8 +13,8 @@ parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__f
|
||||
sys.path.append(parent_dir)
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
|
||||
from crawl4ai.content_scrapping_strategy import WebScrapingStrategy
|
||||
from crawl4ai.content_scrapping_strategy import WebScrapingStrategy as WebScrapingStrategyCurrent
|
||||
from crawl4ai.content_scraping_strategy import WebScrapingStrategy
|
||||
from crawl4ai.content_scraping_strategy import WebScrapingStrategy as WebScrapingStrategyCurrent
|
||||
# from crawl4ai.content_scrapping_strategy_current import WebScrapingStrategy as WebScrapingStrategyCurrent
|
||||
|
||||
@dataclass
|
||||
|
||||
165
tests/async/test_markdown_genertor.py
Normal file
165
tests/async/test_markdown_genertor.py
Normal file
@@ -0,0 +1,165 @@
|
||||
# ## Issue #236
|
||||
# - **Last Updated:** 2024-11-11 01:42:14
|
||||
# - **Title:** [user data crawling opens two windows, unable to control correct user browser](https://github.com/unclecode/crawl4ai/issues/236)
|
||||
# - **State:** open
|
||||
|
||||
import asyncio
import os
import sys
import time
from typing import Any, Dict

# This file sits in tests/async/, so three dirname() hops are needed to reach
# the directory above tests/ (the same depth the sibling async tests use).
# Two hops only reach tests/, which does not help resolve the crawl4ai import.
parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(parent_dir)

# Imported after the sys.path tweak on purpose.
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerationStrategy  # noqa: E402

# Directory containing this test file; the data/ and output/ folders used by
# the tests below are resolved relative to it.
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
|
||||
def print_test_result(name: str, result: Dict[str, Any], execution_time: float):
    """Print a timing banner for a test and save its string results to disk.

    Every ``str`` value in ``result`` is written to
    ``<__location__>/output/<name.lower()>_<key>.md``; values of any other
    type are skipped.

    Args:
        name: Label shown in the banner; its lower-cased form prefixes the
            output file names.
        result: Mapping of output key to content.
        execution_time: Elapsed time in seconds, printed with four decimals.
    """
    print(f"\n{'='*20} {name} {'='*20}")
    print(f"Execution time: {execution_time:.4f} seconds")

    # Save markdown to files
    output_dir = os.path.join(__location__, "output")
    # The folder may not exist yet (e.g. on a fresh checkout); create it
    # instead of letting open() fail with FileNotFoundError.
    os.makedirs(output_dir, exist_ok=True)
    for key, content in result.items():
        if isinstance(content, str):
            target = os.path.join(output_dir, f"{name.lower()}_{key}.md")
            # Explicit UTF-8: citation markers such as "⟨1⟩" cannot be encoded
            # by several platform default encodings.
            with open(target, "w", encoding="utf-8") as f:
                f.write(content)
|
||||
|
||||
# # Print first few lines of each markdown version
|
||||
# for key, content in result.items():
|
||||
# if isinstance(content, str):
|
||||
# preview = '\n'.join(content.split('\n')[:3])
|
||||
# print(f"\n{key} (first 3 lines):")
|
||||
# print(preview)
|
||||
# print(f"Total length: {len(content)} characters")
|
||||
|
||||
def test_basic_markdown_conversion():
    """Test basic markdown conversion with links.

    Runs the ``data/wikipedia.html`` fixture through
    ``DefaultMarkdownGenerationStrategy.generate_markdown``, hands the three
    resulting strings to ``print_test_result`` together with the elapsed time,
    and asserts that each output is non-empty and carries the expected
    citation and reference markers.
    """
    # Decode explicitly so the test does not depend on the platform's default
    # encoding (fixture assumed to be UTF-8 -- TODO confirm).
    fixture_path = os.path.join(__location__, "data", "wikipedia.html")
    with open(fixture_path, "r", encoding="utf-8") as f:
        cleaned_html = f.read()

    generator = DefaultMarkdownGenerationStrategy()

    # Time only the conversion itself, not fixture loading or reporting.
    start_time = time.perf_counter()
    result = generator.generate_markdown(
        cleaned_html=cleaned_html,
        base_url="https://en.wikipedia.org"
    )
    execution_time = time.perf_counter() - start_time

    print_test_result("Basic Markdown Conversion", {
        'raw': result.raw_markdown,
        'with_citations': result.markdown_with_citations,
        'references': result.references_markdown
    }, execution_time)

    # Basic assertions
    assert result.raw_markdown, "Raw markdown should not be empty"
    assert result.markdown_with_citations, "Markdown with citations should not be empty"
    assert result.references_markdown, "References should not be empty"
    assert "⟨" in result.markdown_with_citations, "Citations should use ⟨⟩ brackets"
    assert "## References" in result.references_markdown, "Should contain references section"
|
||||
|
||||
def test_relative_links():
    """Check that relative link targets appear in the references as absolute URLs.

    The sample mixes root-relative links, one absolute link and a
    root-relative image path; the references output must contain each
    asserted URL resolved against the base URL.
    """
    sample_text = """
    Here's a [relative link](/wiki/Apple) and an [absolute link](https://example.com).
    Also an [image](/images/test.png) and another [page](/wiki/Banana).
    """

    strategy = DefaultMarkdownGenerationStrategy()
    converted = strategy.generate_markdown(
        cleaned_html=sample_text,
        base_url="https://en.wikipedia.org",
    )

    expected_urls = (
        "https://en.wikipedia.org/wiki/Apple",
        "https://example.com",
        "https://en.wikipedia.org/images/test.png",
    )
    for expected_url in expected_urls:
        assert expected_url in converted.references_markdown
|
||||
|
||||
def test_duplicate_links():
    """Check that a link target used twice is cited with one shared number.

    The sample links to ``/test`` twice and to ``/other`` once, so the
    marker ``⟨1⟩`` is expected exactly two times in the cited markdown.
    """
    sample_text = """
    Here's a [link](/test) and another [link](/test) and a [different link](/other).
    """

    strategy = DefaultMarkdownGenerationStrategy()
    converted = strategy.generate_markdown(
        cleaned_html=sample_text,
        base_url="https://example.com",
    )

    # Both occurrences of the repeated target must carry the first marker.
    first_marker_uses = converted.markdown_with_citations.count("⟨1⟩")
    assert first_marker_uses == 2, "Same link should use same citation number"
|
||||
|
||||
def test_link_descriptions():
    """Check that link titles and link texts are carried into the references.

    One link has an explicit title (``"Test Title"``) and the other has only
    its anchor text; both strings must show up in the references output.
    """
    sample_text = """
    Here's a [link with title](/test "Test Title") and a [link with description](/other) to test.
    """

    strategy = DefaultMarkdownGenerationStrategy()
    converted = strategy.generate_markdown(
        cleaned_html=sample_text,
        base_url="https://example.com",
    )
    references = converted.references_markdown

    assert "Test Title" in references, "Link title should be in references"
    assert "link with description" in references, "Link text should be in references"
|
||||
|
||||
def test_performance_large_document():
    """Test performance with large document.

    Converts the ``data/wikipedia.md`` fixture five times with a single
    generator instance and prints the average, minimum and maximum
    wall-clock time. This is a timing report only; nothing is asserted.
    """
    # Decode explicitly so the test does not depend on the platform's default
    # encoding (fixture assumed to be UTF-8 -- TODO confirm).
    fixture_path = os.path.join(__location__, "data", "wikipedia.md")
    with open(fixture_path, "r", encoding="utf-8") as f:
        markdown = f.read()

    # Test with multiple iterations
    iterations = 5
    times = []

    generator = DefaultMarkdownGenerationStrategy()

    for _ in range(iterations):
        start_time = time.perf_counter()
        # The conversion result is not inspected; only its duration matters here.
        generator.generate_markdown(
            cleaned_html=markdown,
            base_url="https://en.wikipedia.org"
        )
        end_time = time.perf_counter()
        times.append(end_time - start_time)

    avg_time = sum(times) / len(times)
    print(f"\n{'='*20} Performance Test {'='*20}")
    print(f"Average execution time over {iterations} iterations: {avg_time:.4f} seconds")
    print(f"Min time: {min(times):.4f} seconds")
    print(f"Max time: {max(times):.4f} seconds")
|
||||
|
||||
def test_image_links():
    """Test handling of image links.

    The sample contains two images (the first with the title
    ``"Image Title"``) and one regular link. The cited markdown must keep
    the ``![`` image syntax and the references must mention the image title.
    """
    # The sample has to contain real image syntax: without it neither the
    # "![" check nor the "Image Title" check below has anything to match.
    markdown = """
    Here's an ![image](/image.png "Image Title") and another ![image](/image2.png).
    And a regular [link](/page).
    """

    generator = DefaultMarkdownGenerationStrategy()
    result = generator.generate_markdown(
        cleaned_html=markdown,
        base_url="https://example.com"
    )

    assert "![" in result.markdown_with_citations, "Image markdown syntax should be preserved"
    assert "Image Title" in result.references_markdown, "Image title should be in references"
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: announce the run, then execute every test in the
    # order in which the tests are defined in this file.
    print("Running markdown generation strategy tests...")

    for run_test in (
        test_basic_markdown_conversion,
        test_relative_links,
        test_duplicate_links,
        test_link_descriptions,
        test_performance_large_document,
        test_image_links,
    ):
        run_test()
|
||||
|
||||
Reference in New Issue
Block a user