refactor(docs): update import paths and clean up example code in quickstart_examples.py

This commit is contained in:
UncleCode
2025-04-05 22:55:56 +08:00
parent 935d9d39f8
commit ca9351252a

View File

@@ -5,7 +5,7 @@ import base64
from pathlib import Path from pathlib import Path
from typing import List from typing import List
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult
from crawl4ai.configs import ProxyConfig from crawl4ai.proxy_strategy import ProxyConfig
from crawl4ai import RoundRobinProxyStrategy from crawl4ai import RoundRobinProxyStrategy
from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy
from crawl4ai import LLMConfig from crawl4ai import LLMConfig
@@ -19,10 +19,9 @@ __cur_dir__ = Path(__file__).parent
async def demo_basic_crawl(): async def demo_basic_crawl():
"""Basic web crawling with markdown generation""" """Basic web crawling with markdown generation"""
print("\n=== 1. Basic Web Crawling ===") print("\n=== 1. Basic Web Crawling ===")
async with AsyncWebCrawler() as crawler: async with AsyncWebCrawler() as crawler:
results: List[CrawlResult] = await crawler.arun( results: List[CrawlResult] = await crawler.arun(
url="https://news.ycombinator.com/", url="https://news.ycombinator.com/"
) )
for i, result in enumerate(results): for i, result in enumerate(results):
@@ -34,7 +33,6 @@ async def demo_basic_crawl():
else: else:
print("Failed to crawl the URL") print("Failed to crawl the URL")
async def demo_parallel_crawl(): async def demo_parallel_crawl():
"""Crawl multiple URLs in parallel""" """Crawl multiple URLs in parallel"""
print("\n=== 2. Parallel Crawling ===") print("\n=== 2. Parallel Crawling ===")
@@ -56,14 +54,13 @@ async def demo_parallel_crawl():
f" {i + 1}. {result.url} - {'Success' if result.success else 'Failed'}" f" {i + 1}. {result.url} - {'Success' if result.success else 'Failed'}"
) )
async def demo_fit_markdown(): async def demo_fit_markdown():
"""Generate focused markdown with LLM content filter""" """Generate focused markdown with LLM content filter"""
print("\n=== 3. Fit Markdown with LLM Content Filter ===") print("\n=== 3. Fit Markdown with LLM Content Filter ===")
async with AsyncWebCrawler() as crawler: async with AsyncWebCrawler() as crawler:
result = await crawler.arun( result: CrawlResult = await crawler.arun(
"https://en.wikipedia.org/wiki/Python_(programming_language)", url = "https://en.wikipedia.org/wiki/Python_(programming_language)",
config=CrawlerRunConfig( config=CrawlerRunConfig(
markdown_generator=DefaultMarkdownGenerator( markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter() content_filter=PruningContentFilter()
@@ -75,7 +72,6 @@ async def demo_fit_markdown():
print(f"Raw: {len(result.markdown.raw_markdown)} chars") print(f"Raw: {len(result.markdown.raw_markdown)} chars")
print(f"Fit: {len(result.markdown.fit_markdown)} chars") print(f"Fit: {len(result.markdown.fit_markdown)} chars")
async def demo_llm_structured_extraction_no_schema(): async def demo_llm_structured_extraction_no_schema():
# Create a simple LLM extraction strategy (no schema required) # Create a simple LLM extraction strategy (no schema required)
extraction_strategy = LLMExtractionStrategy( extraction_strategy = LLMExtractionStrategy(
@@ -83,7 +79,7 @@ async def demo_llm_structured_extraction_no_schema():
provider="groq/qwen-2.5-32b", provider="groq/qwen-2.5-32b",
api_token="env:GROQ_API_KEY", api_token="env:GROQ_API_KEY",
), ),
instruction="This is news.ycombinator.com, extract all news for each. title, source url, number of comments.", instruction="This is news.ycombinator.com, extract all news, and for each, I want title, source url, number of comments.",
extract_type="schema", extract_type="schema",
schema="{title: string, url: string, comments: int}", schema="{title: string, url: string, comments: int}",
extra_args={ extra_args={
@@ -109,7 +105,6 @@ async def demo_llm_structured_extraction_no_schema():
else: else:
print("Failed to extract structured data") print("Failed to extract structured data")
async def demo_css_structured_extraction_no_schema(): async def demo_css_structured_extraction_no_schema():
"""Extract structured data using CSS selectors""" """Extract structured data using CSS selectors"""
print("\n=== 5. CSS-Based Structured Extraction ===") print("\n=== 5. CSS-Based Structured Extraction ===")
@@ -129,27 +124,33 @@ async def demo_css_structured_extraction_no_schema():
<span class="h-datetime"><i class="icon-font icon-calendar"></i>Apr 05, 2025</span> <span class="h-datetime"><i class="icon-font icon-calendar"></i>Apr 05, 2025</span>
<span class="h-tags">Malware / Supply Chain Attack</span> <span class="h-tags">Malware / Supply Chain Attack</span>
</div> </div>
<div class="home-desc"> Cybersecurity researchers have uncovered malicious libraries in the Python Package Index (PyPI) repository that are designed to steal sensitive information. Two of the packages, bitcoinlibdbfix and bitcoinlib-dev, masquerade as fixes for recent issues detected in a legitimate Python module called bitcoinlib, according to ReversingLabs . A third package discovered by Socket, disgrasya, contained a fully automated carding script targeting WooCommerce stores. The packages attracted hundreds of downloads before being taken down, according to statistics from pepy.tech - bitcoinlibdbfix - 1,101 downloads bitcoinlib-dev - 735 downloads disgrasya - 37,217 downloads "The malicious libraries both attempt a similar attack, overwriting the legitimate 'clw cli' command with malicious code that attempts to exfiltrate sensitive database files," ReversingLabs said. In an interesting twist, the authors of the counterfeit libraries are said to have joined a GitHub issue...</div> <div class="home-desc"> Cybersecurity researchers have...</div>
</div> </div>
</div> </div>
</a> </a>
</div> </div>
""" """
# Generate schema using LLM (one-time setup) # Check if schema file exists
schema = JsonCssExtractionStrategy.generate_schema( schema_file_path = f"{__cur_dir__}/tmp/schema.json"
html=sample_html, if os.path.exists(schema_file_path):
llm_config=LLMConfig( with open(schema_file_path, "r") as f:
provider="groq/qwen-2.5-32b", schema = json.load(f)
api_token="env:GROQ_API_KEY", else:
), # Generate schema using LLM (one-time setup)
query="From https://thehackernews.com/, I have shares a sample of one news div with a title, date, and description. Please generate a schema for this news div.", schema = JsonCssExtractionStrategy.generate_schema(
) html=sample_html,
llm_config=LLMConfig(
provider="groq/qwen-2.5-32b",
api_token="env:GROQ_API_KEY",
),
query="From https://thehackernews.com/, I have shared a sample of one news div with a title, date, and description. Please generate a schema for this news div.",
)
print(f"Generated schema: {json.dumps(schema, indent=2)}") print(f"Generated schema: {json.dumps(schema, indent=2)}")
# Save the schema to a file , and use it for future extractions, in result for such extraction you will call LLM once # Save the schema to a file , and use it for future extractions, in result for such extraction you will call LLM once
# with open("schema.json", "w") as f: with open(f"{__cur_dir__}/tmp/schema.json", "w") as f:
# json.dump(schema, f, indent=2) json.dump(schema, f, indent=2)
# Create no-LLM extraction strategy with the generated schema # Create no-LLM extraction strategy with the generated schema
extraction_strategy = JsonCssExtractionStrategy(schema) extraction_strategy = JsonCssExtractionStrategy(schema)
@@ -170,7 +171,6 @@ async def demo_css_structured_extraction_no_schema():
else: else:
print("Failed to extract structured data") print("Failed to extract structured data")
async def demo_deep_crawl(): async def demo_deep_crawl():
"""Deep crawling with BFS strategy""" """Deep crawling with BFS strategy"""
print("\n=== 6. Deep Crawling ===") print("\n=== 6. Deep Crawling ===")
@@ -192,7 +192,6 @@ async def demo_deep_crawl():
depth = result.metadata.get("depth", "unknown") depth = result.metadata.get("depth", "unknown")
print(f" {i + 1}. {result.url} (Depth: {depth})") print(f" {i + 1}. {result.url} (Depth: {depth})")
async def demo_js_interaction(): async def demo_js_interaction():
"""Execute JavaScript to load more content""" """Execute JavaScript to load more content"""
print("\n=== 7. JavaScript Interaction ===") print("\n=== 7. JavaScript Interaction ===")
@@ -255,8 +254,6 @@ async def demo_js_interaction():
print("Failed to extract structured data") print("Failed to extract structured data")
print(f"Total items: {len(news)}") print(f"Total items: {len(news)}")
async def demo_media_and_links(): async def demo_media_and_links():
"""Extract media and links from a page""" """Extract media and links from a page"""
print("\n=== 8. Media and Links Extraction ===") print("\n=== 8. Media and Links Extraction ===")
@@ -275,17 +272,24 @@ async def demo_media_and_links():
print(f"Found {len(internal_links)} internal links") print(f"Found {len(internal_links)} internal links")
print(f"Found {len(external_links)} external links") print(f"Found {len(external_links)} external links")
# Save everything to files # Print some of the images and links
with open("images.json", "w") as f: for image in images[:3]:
json.dump(images, f, indent=2) print(f"Image: {image['src']}")
for link in internal_links[:3]:
print(f"Internal link: {link['href']}")
for link in external_links[:3]:
print(f"External link: {link['href']}")
with open("links.json", "w") as f: # # Save everything to files
json.dump( # with open("images.json", "w") as f:
{"internal": internal_links, "external": external_links}, # json.dump(images, f, indent=2)
f,
indent=2,
)
# with open("links.json", "w") as f:
# json.dump(
# {"internal": internal_links, "external": external_links},
# f,
# indent=2,
# )
async def demo_screenshot_and_pdf(): async def demo_screenshot_and_pdf():
"""Capture screenshot and PDF of a page""" """Capture screenshot and PDF of a page"""
@@ -299,6 +303,7 @@ async def demo_screenshot_and_pdf():
) )
for i, result in enumerate(result): for i, result in enumerate(result):
# if result.screenshot_data:
if result.screenshot: if result.screenshot:
# Save screenshot # Save screenshot
screenshot_path = f"{__cur_dir__}/tmp/example_screenshot.png" screenshot_path = f"{__cur_dir__}/tmp/example_screenshot.png"
@@ -306,6 +311,7 @@ async def demo_screenshot_and_pdf():
f.write(base64.b64decode(result.screenshot)) f.write(base64.b64decode(result.screenshot))
print(f"Screenshot saved to {screenshot_path}") print(f"Screenshot saved to {screenshot_path}")
# if result.pdf_data:
if result.pdf: if result.pdf:
# Save PDF # Save PDF
pdf_path = f"{__cur_dir__}/tmp/example.pdf" pdf_path = f"{__cur_dir__}/tmp/example.pdf"
@@ -313,7 +319,6 @@ async def demo_screenshot_and_pdf():
f.write(result.pdf) f.write(result.pdf)
print(f"PDF saved to {pdf_path}") print(f"PDF saved to {pdf_path}")
async def demo_proxy_rotation(): async def demo_proxy_rotation():
"""Proxy rotation for multiple requests""" """Proxy rotation for multiple requests"""
print("\n=== 10. Proxy Rotation ===") print("\n=== 10. Proxy Rotation ===")
@@ -339,7 +344,6 @@ async def demo_proxy_rotation():
# In a real scenario, these would be run and the proxies would rotate # In a real scenario, these would be run and the proxies would rotate
print("In a real scenario, requests would rotate through the available proxies") print("In a real scenario, requests would rotate through the available proxies")
async def demo_raw_html_and_file(): async def demo_raw_html_and_file():
"""Process raw HTML and local files""" """Process raw HTML and local files"""
print("\n=== 11. Raw HTML and Local Files ===") print("\n=== 11. Raw HTML and Local Files ===")
@@ -376,29 +380,27 @@ async def demo_raw_html_and_file():
os.remove(file_path) os.remove(file_path)
print(f"Processed both raw HTML and local file ({file_path})") print(f"Processed both raw HTML and local file ({file_path})")
async def main(): async def main():
"""Run all demo functions sequentially""" """Run all demo functions sequentially"""
print("=== Comprehensive Crawl4AI Demo ===") print("=== Comprehensive Crawl4AI Demo ===")
print("Note: Some examples require API keys or other configurations") print("Note: Some examples require API keys or other configurations")
# Run all demos # Run all demos
await demo_basic_crawl() # await demo_basic_crawl()
await demo_parallel_crawl() # await demo_parallel_crawl()
await demo_fit_markdown() # await demo_fit_markdown()
await demo_llm_structured_extraction_no_schema() # await demo_llm_structured_extraction_no_schema()
await demo_css_structured_extraction_no_schema() # await demo_css_structured_extraction_no_schema()
await demo_deep_crawl() await demo_deep_crawl()
await demo_js_interaction() # await demo_js_interaction()
await demo_media_and_links() # await demo_media_and_links()
await demo_screenshot_and_pdf() # await demo_screenshot_and_pdf()
# await demo_proxy_rotation() # # await demo_proxy_rotation()
await demo_raw_html_and_file() # await demo_raw_html_and_file()
# Clean up any temp files that may have been created # Clean up any temp files that may have been created
print("\n=== Demo Complete ===") print("\n=== Demo Complete ===")
print("Check for any generated files (screenshots, PDFs) in the current directory") print("Check for any generated files (screenshots, PDFs) in the current directory")
if __name__ == "__main__": if __name__ == "__main__":
asyncio.run(main()) asyncio.run(main())