In this commit, we introduce the new concept of MakrdownGenerationStrategy, which allows us to expand our future strategies to generate better markdown. Right now, we generate raw markdown as we were doing before. We have a new algorithm for fitting markdown based on BM25, and now we add the ability to refine markdown into a citation form. Our links will be extracted and replaced by a citation reference number, and then we will have reference sections at the very end; we add all the links with the descriptions. This format is more suitable for large language models. In case we don't need to pass links, we can reduce the size of the markdown significantly and also attach the list of references as a separate file to a large language model. This commit contains changes for this direction.

This commit is contained in:
UncleCode
2024-11-21 18:21:43 +08:00
parent 7047422e48
commit dbb751c8f0
12 changed files with 506 additions and 762 deletions

View File

@@ -13,8 +13,8 @@ parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__f
sys.path.append(parent_dir)
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
from crawl4ai.content_scrapping_strategy import WebScrapingStrategy
from crawl4ai.content_scrapping_strategy import WebScrapingStrategy as WebScrapingStrategyCurrent
from crawl4ai.content_scraping_strategy import WebScrapingStrategy
from crawl4ai.content_scraping_strategy import WebScrapingStrategy as WebScrapingStrategyCurrent
# from crawl4ai.content_scrapping_strategy_current import WebScrapingStrategy as WebScrapingStrategyCurrent
@dataclass

View File

@@ -0,0 +1,165 @@
# ## Issue #236
# - **Last Updated:** 2024-11-11 01:42:14
# - **Title:** [user data crawling opens two windows, unable to control correct user browser](https://github.com/unclecode/crawl4ai/issues/236)
# - **State:** open
import os, sys, time
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
__location__ = os.path.realpath( os.path.join(os.getcwd(), os.path.dirname(__file__)))
import asyncio
import os
import time
from typing import Dict, Any
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerationStrategy
# Get current directory
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
def print_test_result(name: str, result: Dict[str, Any], execution_time: float):
"""Helper function to print test results."""
print(f"\n{'='*20} {name} {'='*20}")
print(f"Execution time: {execution_time:.4f} seconds")
# Save markdown to files
for key, content in result.items():
if isinstance(content, str):
with open(__location__ + f"/output/{name.lower()}_{key}.md", "w") as f:
f.write(content)
# # Print first few lines of each markdown version
# for key, content in result.items():
# if isinstance(content, str):
# preview = '\n'.join(content.split('\n')[:3])
# print(f"\n{key} (first 3 lines):")
# print(preview)
# print(f"Total length: {len(content)} characters")
def test_basic_markdown_conversion():
"""Test basic markdown conversion with links."""
with open(__location__ + "/data/wikipedia.html", "r") as f:
cleaned_html = f.read()
generator = DefaultMarkdownGenerationStrategy()
start_time = time.perf_counter()
result = generator.generate_markdown(
cleaned_html=cleaned_html,
base_url="https://en.wikipedia.org"
)
execution_time = time.perf_counter() - start_time
print_test_result("Basic Markdown Conversion", {
'raw': result.raw_markdown,
'with_citations': result.markdown_with_citations,
'references': result.references_markdown
}, execution_time)
# Basic assertions
assert result.raw_markdown, "Raw markdown should not be empty"
assert result.markdown_with_citations, "Markdown with citations should not be empty"
assert result.references_markdown, "References should not be empty"
assert "" in result.markdown_with_citations, "Citations should use ⟨⟩ brackets"
assert "## References" in result.references_markdown, "Should contain references section"
def test_relative_links():
"""Test handling of relative links with base URL."""
markdown = """
Here's a [relative link](/wiki/Apple) and an [absolute link](https://example.com).
Also an [image](/images/test.png) and another [page](/wiki/Banana).
"""
generator = DefaultMarkdownGenerationStrategy()
result = generator.generate_markdown(
cleaned_html=markdown,
base_url="https://en.wikipedia.org"
)
assert "https://en.wikipedia.org/wiki/Apple" in result.references_markdown
assert "https://example.com" in result.references_markdown
assert "https://en.wikipedia.org/images/test.png" in result.references_markdown
def test_duplicate_links():
"""Test handling of duplicate links."""
markdown = """
Here's a [link](/test) and another [link](/test) and a [different link](/other).
"""
generator = DefaultMarkdownGenerationStrategy()
result = generator.generate_markdown(
cleaned_html=markdown,
base_url="https://example.com"
)
# Count citations in markdown
citations = result.markdown_with_citations.count("⟨1⟩")
assert citations == 2, "Same link should use same citation number"
def test_link_descriptions():
"""Test handling of link titles and descriptions."""
markdown = """
Here's a [link with title](/test "Test Title") and a [link with description](/other) to test.
"""
generator = DefaultMarkdownGenerationStrategy()
result = generator.generate_markdown(
cleaned_html=markdown,
base_url="https://example.com"
)
assert "Test Title" in result.references_markdown, "Link title should be in references"
assert "link with description" in result.references_markdown, "Link text should be in references"
def test_performance_large_document():
"""Test performance with large document."""
with open(__location__ + "/data/wikipedia.md", "r") as f:
markdown = f.read()
# Test with multiple iterations
iterations = 5
times = []
generator = DefaultMarkdownGenerationStrategy()
for i in range(iterations):
start_time = time.perf_counter()
result = generator.generate_markdown(
cleaned_html=markdown,
base_url="https://en.wikipedia.org"
)
end_time = time.perf_counter()
times.append(end_time - start_time)
avg_time = sum(times) / len(times)
print(f"\n{'='*20} Performance Test {'='*20}")
print(f"Average execution time over {iterations} iterations: {avg_time:.4f} seconds")
print(f"Min time: {min(times):.4f} seconds")
print(f"Max time: {max(times):.4f} seconds")
def test_image_links():
"""Test handling of image links."""
markdown = """
Here's an ![image](/image.png "Image Title") and another ![image](/other.jpg).
And a regular [link](/page).
"""
generator = DefaultMarkdownGenerationStrategy()
result = generator.generate_markdown(
cleaned_html=markdown,
base_url="https://example.com"
)
assert "![" in result.markdown_with_citations, "Image markdown syntax should be preserved"
assert "Image Title" in result.references_markdown, "Image title should be in references"
if __name__ == "__main__":
print("Running markdown generation strategy tests...")
test_basic_markdown_conversion()
test_relative_links()
test_duplicate_links()
test_link_descriptions()
test_performance_large_document()
test_image_links()