feat: add enhanced markdown generation example with citations and file output
This commit is contained in:
@@ -52,34 +52,7 @@ async def download_example():
|
|||||||
else:
|
else:
|
||||||
print("\nNo files were downloaded")
|
print("\nNo files were downloaded")
|
||||||
|
|
||||||
# 2. Content Filtering with BM25 Example
|
# 2. Local File and Raw HTML Processing Example
|
||||||
async def content_filtering_example():
    """Example of using the new BM25 content filtering.

    Crawls the TechCrunch front page with a ``BM25ContentFilter`` attached,
    prints the length and text of the filtered markdown, then saves the
    filtered HTML and markdown under the module-level ``__data__`` directory
    (defined elsewhere in this file).
    """
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Create the BM25 filter. No user_query is passed: both candidate
        # queries below are intentionally left commented out.
        content_filter = BM25ContentFilter(
            # user_query="Investment and fundraising",
            # user_query="Robotic",
            bm25_threshold=1.0
        )

        result = await crawler.arun(
            url="https://techcrunch.com/",
            content_filter=content_filter,
            cache_mode=CacheMode.BYPASS
        )

        print(f"Filtered content: {len(result.fit_markdown)}")
        print(f"Filtered content: {result.fit_markdown}")

        # Save html
        # Scraped pages routinely contain non-ASCII characters, so write as
        # UTF-8 explicitly instead of relying on the platform default encoding.
        with open(os.path.join(__data__, "techcrunch.html"), "w", encoding="utf-8") as f:
            f.write(result.fit_html)

        with open(os.path.join(__data__, "filtered_content.md"), "w", encoding="utf-8") as f:
            f.write(result.fit_markdown)
|
|
||||||
|
|
||||||
# 3. Local File and Raw HTML Processing Example
|
|
||||||
async def local_and_raw_html_example():
|
async def local_and_raw_html_example():
|
||||||
"""Example of processing local files and raw HTML"""
|
"""Example of processing local files and raw HTML"""
|
||||||
# Create a sample HTML file
|
# Create a sample HTML file
|
||||||
@@ -115,6 +88,68 @@ async def local_and_raw_html_example():
|
|||||||
print("Local file content:", local_result.markdown)
|
print("Local file content:", local_result.markdown)
|
||||||
print("\nRaw HTML content:", raw_result.markdown)
|
print("\nRaw HTML content:", raw_result.markdown)
|
||||||
|
|
||||||
|
# 3. Enhanced Markdown Generation Example
|
||||||
|
async def markdown_generation_example():
    """Example of enhanced markdown generation with citations and LLM-friendly features.

    Crawls the Wikipedia "Apple" article (restricted to the main body via a
    CSS selector) with a BM25 content filter, prints the length of every
    markdown variant exposed on ``result.markdown_v2``, writes each variant to
    ``<__data__>/markdown_examples/`` and finally prints a short sample of the
    citation-style markdown and of its references section.

    ``AsyncWebCrawler``, ``BM25ContentFilter``, ``CacheMode`` and ``__data__``
    are module-level names defined elsewhere in this file. They must NOT be
    re-imported inside this function: a function-scope import turns the name
    into a local for the whole body, so the ``async with AsyncWebCrawler(...)``
    line would raise ``UnboundLocalError`` before the import is reached.
    """
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Create a content filter (optional)
        content_filter = BM25ContentFilter(
            # user_query="History and cultivation",
            bm25_threshold=1.0
        )

        # Single crawl with the configured filter; bypass the cache so the
        # example always reflects a fresh fetch.
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Apple",
            css_selector="main div#bodyContent",
            content_filter=content_filter,
            cache_mode=CacheMode.BYPASS
        )

        markdown_v2 = result.markdown_v2

        print("\nMarkdown Generation Results:")
        print(f"1. Original markdown length: {len(result.markdown)}")
        print("2. New markdown versions (markdown_v2):")
        print(f" - Raw markdown length: {len(markdown_v2.raw_markdown)}")
        print(f" - Citations markdown length: {len(markdown_v2.markdown_with_citations)}")
        print(f" - References section length: {len(markdown_v2.references_markdown)}")
        if markdown_v2.fit_markdown:
            print(f" - Filtered markdown length: {len(markdown_v2.fit_markdown)}")

        # Save examples to files
        output_dir = os.path.join(__data__, "markdown_examples")
        os.makedirs(output_dir, exist_ok=True)

        # Save different versions; the filtered variant is written only when
        # the crawl actually produced one.
        versions = [
            ("1_raw_markdown.md", markdown_v2.raw_markdown),
            ("2_citations_markdown.md", markdown_v2.markdown_with_citations),
            ("3_references.md", markdown_v2.references_markdown),
        ]
        if markdown_v2.fit_markdown:
            versions.append(("4_filtered_markdown.md", markdown_v2.fit_markdown))

        for filename, content in versions:
            # Explicit UTF-8: the article contains non-ASCII characters and
            # the platform default encoding may not be able to represent them.
            with open(os.path.join(output_dir, filename), "w", encoding="utf-8") as f:
                f.write(content)

        print(f"\nMarkdown examples saved to: {output_dir}")

        # Show a sample of citations and references
        print("\nSample of markdown with citations:")
        print(markdown_v2.markdown_with_citations[:500] + "...\n")
        print("Sample of references:")
        print('\n'.join(markdown_v2.references_markdown.split('\n')[:10]) + "...")
|
||||||
|
|
||||||
# 4. Browser Management Example
|
# 4. Browser Management Example
|
||||||
async def browser_management_example():
|
async def browser_management_example():
|
||||||
"""Example of using enhanced browser management features"""
|
"""Example of using enhanced browser management features"""
|
||||||
@@ -208,9 +243,13 @@ async def api_example():
|
|||||||
headers=headers
|
headers=headers
|
||||||
) as status_response:
|
) as status_response:
|
||||||
result = await status_response.json()
|
result = await status_response.json()
|
||||||
print(f"Task result: {result}")
|
print(f"Task status: {result['status']}")
|
||||||
|
|
||||||
if result["status"] == "completed":
|
if result["status"] == "completed":
|
||||||
|
print("Task completed!")
|
||||||
|
print("Results:")
|
||||||
|
news = json.loads(result["results"][0]['extracted_content'])
|
||||||
|
print(json.dumps(news[:4], indent=2))
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
await asyncio.sleep(1)
|
await asyncio.sleep(1)
|
||||||
@@ -220,15 +259,15 @@ async def main():
|
|||||||
# print("Running Crawl4AI feature examples...")
|
# print("Running Crawl4AI feature examples...")
|
||||||
|
|
||||||
# print("\n1. Running Download Example:")
|
# print("\n1. Running Download Example:")
|
||||||
await download_example()
|
# await download_example()
|
||||||
|
|
||||||
# print("\n2. Running Content Filtering Example:")
|
# print("\n2. Running Markdown Generation Example:")
|
||||||
await content_filtering_example()
|
# await markdown_generation_example()
|
||||||
|
|
||||||
# print("\n3. Running Local and Raw HTML Example:")
|
# # print("\n3. Running Local and Raw HTML Example:")
|
||||||
await local_and_raw_html_example()
|
# await local_and_raw_html_example()
|
||||||
|
|
||||||
# print("\n4. Running Browser Management Example:")
|
# # print("\n4. Running Browser Management Example:")
|
||||||
await browser_management_example()
|
await browser_management_example()
|
||||||
|
|
||||||
# print("\n5. Running API Example:")
|
# print("\n5. Running API Example:")
|
||||||
|
|||||||
Reference in New Issue
Block a user