Release prep (#749)
* fix: Update export of URLPatternFilter * chore: Add dependency for cchardet in requirements * docs: Update example for deep crawl in release note for v0.5 * Docs: update the example for memory dispatcher * docs: updated example for crawl strategies * Refactor: Removed wrapping in if __name__==main block since this is a markdown file. * chore: removed cchardet from dependency list, since unclecode is planning to remove it * docs: updated the example for proxy rotation to a working example * feat: Introduced ProxyConfig param * Add tutorial for deep crawl & update contributor list for bug fixes in feb alpha-1 * chore: update and test new dependencies * feat: Make PyPDF2 a conditional dependency * updated tutorial and release note for v0.5 * docs: update docs for deep crawl, and fix a typo in docker-deployment markdown filename * refactor: 1. Deprecate markdown_v2 2. Make markdown backward compatible to behave as a string when needed. 3. Fix LlmConfig usage in cli 4. Deprecate markdown_v2 in cli 5. Update AsyncWebCrawler for changes in CrawlResult * fix: Bug in serialisation of markdown in acache_url * Refactor: Added deprecation errors for fit_html and fit_markdown directly on markdown. Now access them via markdown * fix: remove deprecated markdown_v2 from docker * Refactor: remove deprecated fit_markdown and fit_html from result * refactor: fix cache retrieval for markdown as a string * chore: update all docs, examples and tests with deprecation announcements for markdown_v2, fit_html, fit_markdown
This commit is contained in:
@@ -52,7 +52,7 @@ async def crawl_sequential(urls: List[str]):
|
||||
)
|
||||
if result.success:
|
||||
print(f"Successfully crawled {url}")
|
||||
print(f"Content length: {len(result.markdown_v2.raw_markdown)}")
|
||||
print(f"Content length: {len(result.markdown.raw_markdown)}")
|
||||
finally:
|
||||
await crawler.close()
|
||||
|
||||
@@ -101,7 +101,7 @@ async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
|
||||
print(f"Error crawling {url}: {str(result)}")
|
||||
elif result.success:
|
||||
print(f"Successfully crawled {url}")
|
||||
print(f"Content length: {len(result.markdown_v2.raw_markdown)}")
|
||||
print(f"Content length: {len(result.markdown.raw_markdown)}")
|
||||
finally:
|
||||
await crawler.close()
|
||||
|
||||
|
||||
404
docs/examples/deepcrawl.py
Normal file
404
docs/examples/deepcrawl.py
Normal file
@@ -0,0 +1,404 @@
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
|
||||
from crawl4ai.deep_crawling.filters import (
|
||||
FilterChain,
|
||||
URLPatternFilter,
|
||||
DomainFilter,
|
||||
ContentTypeFilter,
|
||||
ContentRelevanceFilter,
|
||||
SEOFilter,
|
||||
)
|
||||
from crawl4ai.deep_crawling.scorers import (
|
||||
KeywordRelevanceScorer,
|
||||
)
|
||||
|
||||
|
||||
# 1️⃣ Basic Deep Crawl Setup
|
||||
async def basic_deep_crawl():
    """
    PART 1: Run a minimal breadth-first deep crawl and print its shape.

    The function:
    - builds a BFSDeepCrawlStrategy with max_depth=2 and include_external=False
    - crawls https://docs.crawl4ai.com in non-streaming mode
    - buckets the returned pages by the "depth" entry of their metadata
      (pages without that entry are counted at depth 0)
    - prints at most three URLs per depth plus the overall elapsed time
    """
    print("\n===== BASIC DEEP CRAWL SETUP =====")

    # Breadth-first strategy: depth 0 is the start page, then two more levels.
    # include_external=False is intended to keep the crawl on the same domain.
    bfs_strategy = BFSDeepCrawlStrategy(max_depth=2, include_external=False)
    config = CrawlerRunConfig(
        deep_crawl_strategy=bfs_strategy,
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=True,  # Show progress during crawling
    )

    async with AsyncWebCrawler() as crawler:
        started = time.perf_counter()
        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)

        # Bucket page URLs by crawl depth so the tree can be shown level by level
        urls_at_depth = {}
        for page in results:
            urls_at_depth.setdefault(page.metadata.get("depth", 0), []).append(page.url)

        print(f"✅ Crawled {len(results)} pages total")

        # Walk the levels in ascending depth order
        for level in sorted(urls_at_depth):
            level_urls = urls_at_depth[level]
            print(f"\nDepth {level}: {len(level_urls)} pages")
            # Only the first three URLs of a level are listed
            for shown_url in level_urls[:3]:
                print(f" → {shown_url}")
            hidden = len(level_urls) - 3
            if hidden > 0:
                print(f" ... and {hidden} more")

        print(
            f"\n✅ Performance: {len(results)} pages in {time.perf_counter() - started:.2f} seconds"
        )
|
||||
|
||||
|
||||
# 2️⃣ Stream vs. Non-Stream Execution
|
||||
async def stream_vs_nonstream():
    """
    PART 2: Demonstrates the difference between stream and non-stream execution.

    Non-stream: Waits for all results before processing
    Stream: Processes results as they become available

    Both runs crawl https://docs.crawl4ai.com with clones of one base
    configuration (BFS, max_depth=1, same-domain only); only the ``stream``
    flag differs between the two clones.
    """
    print("\n===== STREAM VS. NON-STREAM EXECUTION =====")

    # Common configuration for both examples
    base_config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1, include_external=False),
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=True,
    )

    async with AsyncWebCrawler() as crawler:
        # NON-STREAMING MODE
        print("\n📊 NON-STREAMING MODE:")
        print(" In this mode, all results are collected before being returned.")

        non_stream_config = base_config.clone()
        non_stream_config.stream = False

        start_time = time.perf_counter()
        results = await crawler.arun(
            url="https://docs.crawl4ai.com", config=non_stream_config
        )

        print(f" ✅ Received all {len(results)} results at once")
        print(f" ✅ Total duration: {time.perf_counter() - start_time:.2f} seconds")

        # STREAMING MODE
        print("\n📊 STREAMING MODE:")
        print(" In this mode, results are processed as they become available.")

        stream_config = base_config.clone()
        stream_config.stream = True

        start_time = time.perf_counter()
        result_count = 0
        first_result_time = None  # stays None if the stream yields nothing

        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=stream_config
        ):
            result_count += 1
            if result_count == 1:
                first_result_time = time.perf_counter() - start_time
                print(
                    f" ✅ First result received after {first_result_time:.2f} seconds: {result.url}"
                )
            elif result_count % 5 == 0:  # Show every 5th result for brevity
                print(f" → Result #{result_count}: {result.url}")

        print(f" ✅ Total: {result_count} results")
        # Formatting None with ":.2f" raises TypeError, so only report the
        # first-result latency when at least one result actually arrived.
        if first_result_time is not None:
            print(f" ✅ First result: {first_result_time:.2f} seconds")
        else:
            print(" ⚠️ First result: n/a (no results were streamed)")
        print(f" ✅ All results: {time.perf_counter() - start_time:.2f} seconds")
        print("\n🔍 Key Takeaway: Streaming allows processing results immediately")
|
||||
|
||||
|
||||
# 3️⃣ Introduce Filters & Scorers
|
||||
async def filters_and_scorers():
    """
    PART 3: Demonstrates the use of filters and scorers for more targeted crawling.

    This function progressively adds:
    1. A single URL pattern filter
    2. Multiple filters in a chain
    3. Scorers for prioritizing pages

    Examples 1 and 2 run in non-streaming mode and print the first three
    results; example 3 streams results and prints each page's "score"
    metadata entry (0 is shown when the entry is missing).
    """
    print("\n===== FILTERS AND SCORERS =====")

    async with AsyncWebCrawler() as crawler:
        # SINGLE FILTER EXAMPLE
        print("\n📊 EXAMPLE 1: SINGLE URL PATTERN FILTER")
        print(" Only crawl pages containing 'core' in the URL")

        # Create a filter that only allows URLs with 'core' in them
        url_filter = URLPatternFilter(patterns=["*core*"])

        config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(
                max_depth=1,
                include_external=False,
                filter_chain=FilterChain([url_filter]),  # Single filter
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            cache_mode=CacheMode.BYPASS,
            verbose=True,
        )

        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)

        print(f" ✅ Crawled {len(results)} pages matching '*core*'")
        for result in results[:3]:  # Show first 3 results
            print(f" → {result.url}")
        if len(results) > 3:
            print(f" ... and {len(results) - 3} more")

        # MULTIPLE FILTERS EXAMPLE
        print("\n📊 EXAMPLE 2: MULTIPLE FILTERS IN A CHAIN")
        print(" Only crawl pages that:")
        print(" 1. Contain '2024' in the URL")
        print(" 2. Are from 'techcrunch.com'")
        print(" 3. Are of text/html or application/javascript content type")

        # Create a chain of filters
        filter_chain = FilterChain(
            [
                URLPatternFilter(patterns=["*2024*"]),
                DomainFilter(
                    allowed_domains=["techcrunch.com"],
                    blocked_domains=["guce.techcrunch.com", "oidc.techcrunch.com"],
                ),
                ContentTypeFilter(
                    allowed_types=["text/html", "application/javascript"]
                ),
            ]
        )

        config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(
                max_depth=1, include_external=False, filter_chain=filter_chain
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
        )

        results = await crawler.arun(url="https://techcrunch.com", config=config)

        print(f" ✅ Crawled {len(results)} pages after applying all filters")
        for result in results[:3]:
            print(f" → {result.url}")
        if len(results) > 3:
            print(f" ... and {len(results) - 3} more")

        # SCORERS EXAMPLE
        print("\n📊 EXAMPLE 3: USING A KEYWORD RELEVANCE SCORER")
        print(
            "Score pages based on relevance to keywords: 'crawl', 'example', 'async', 'configuration','javascript','css'"
        )

        # Create a keyword relevance scorer
        keyword_scorer = KeywordRelevanceScorer(
            keywords=["crawl", "example", "async", "configuration","javascript","css"], weight=0.3
        )

        config = CrawlerRunConfig(
            deep_crawl_strategy=BestFirstCrawlingStrategy(  # Note: Changed to BestFirst
                max_depth=1, include_external=False, url_scorer=keyword_scorer
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            cache_mode=CacheMode.BYPASS,
            verbose=True,
            stream=True,
        )

        results = []
        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=config
        ):
            results.append(result)
            # Default to 0 so a missing score cannot break the ":.2f" format
            # (formatting None that way raises TypeError).
            score = result.metadata.get("score", 0)
            print(f" → Score: {score:.2f} | {result.url}")

        print(f" ✅ Crawler prioritized {len(results)} pages by relevance score")
        print(" 🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first")
|
||||
|
||||
|
||||
# 4️⃣ Wrap-Up and Key Takeaways
|
||||
async def wrap_up():
    """
    PART 4: Wrap-Up and Key Takeaways

    Combines a filter chain, a keyword scorer and streaming in one
    best-first crawl of https://docs.crawl4ai.com, then prints the page
    count, elapsed time, average score and a per-depth page count.
    The average and the depth table are skipped when nothing was crawled.
    """
    print("\n===== COMPLETE CRAWLER EXAMPLE =====")
    print("Combining filters, scorers, and streaming for an optimized crawl")

    # Create a sophisticated filter chain
    filter_chain = FilterChain(
        [
            DomainFilter(
                allowed_domains=["docs.crawl4ai.com"],
                blocked_domains=["old.docs.crawl4ai.com"],
            ),
            URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
            ContentTypeFilter(allowed_types=["text/html"]),
        ]
    )

    # Create a keyword scorer used to prioritize which URLs are visited first
    keyword_scorer = KeywordRelevanceScorer(
        keywords=["crawl", "example", "async", "configuration"], weight=0.7
    )

    # Set up the configuration
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=1,
            include_external=False,
            filter_chain=filter_chain,
            url_scorer=keyword_scorer,
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        stream=True,
        verbose=True,
    )

    # Execute the crawl
    results = []
    start_time = time.perf_counter()

    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=config
        ):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")

    duration = time.perf_counter() - start_time

    # Summarize the results
    print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")

    # Without this guard an empty crawl would divide by zero below.
    if not results:
        return

    average_score = sum(r.metadata.get("score", 0) for r in results) / len(results)
    print(f"✅ Average score: {average_score:.2f}")

    # Group by depth
    depth_counts = {}
    for result in results:
        depth = result.metadata.get("depth", 0)
        depth_counts[depth] = depth_counts.get(depth, 0) + 1

    print("\n📊 Pages crawled by depth:")
    for depth, count in sorted(depth_counts.items()):
        print(f" Depth {depth}: {count} pages")
|
||||
|
||||
|
||||
# 5️⃣ Advanced Filters
|
||||
async def advanced_filters():
    """
    PART 5: Show two specialised filters plugged into a BFS deep crawl.

    Example 1 wraps an SEOFilter (threshold 0.5, three keywords) in a
    FilterChain; example 2 does the same with a ContentRelevanceFilter
    (free-text query, threshold 0.7). Each example crawls
    https://docs.crawl4ai.com one level deep with the cache bypassed and
    prints the URLs that were returned. Example 2 also prints the
    "relevance_score" metadata entry, falling back to 0 when it is absent.
    """
    print("\n===== ADVANCED FILTERS =====")

    start_url = "https://docs.crawl4ai.com"

    async with AsyncWebCrawler() as crawler:
        # --- Example 1: SEO filter ---------------------------------------
        print("\n📊 EXAMPLE 1: SEO FILTERS")
        print(
            "Quantitative SEO quality assessment filter based searching keywords in the head section"
        )

        keyword_seo_filter = SEOFilter(
            threshold=0.5, keywords=["dynamic", "interaction", "javascript"]
        )
        seo_config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(
                max_depth=1, filter_chain=FilterChain([keyword_seo_filter])
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        seo_pages = await crawler.arun(url=start_url, config=seo_config)

        print(f" ✅ Found {len(seo_pages)} pages with relevant keywords")
        for page in seo_pages:
            print(f" → {page.url}")

        # --- Example 2: text relevancy filter ----------------------------
        print("\n📊 EXAMPLE 2: ADVANCED TEXT RELEVANCY FILTER")

        query_filter = ContentRelevanceFilter(
            query="Interact with the web using your authentic digital identity",
            threshold=0.7,
        )
        relevance_config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(
                max_depth=1, filter_chain=FilterChain([query_filter])
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        relevant_pages = await crawler.arun(url=start_url, config=relevance_config)

        print(f" ✅ Found {len(relevant_pages)} pages")
        for page in relevant_pages:
            page_score = page.metadata.get("relevance_score", 0)
            print(f" → Score: {page_score:.2f} | {page.url}")
|
||||
|
||||
|
||||
# Main function to run the entire tutorial
|
||||
async def run_tutorial():
    """
    Run every tutorial section one after another.

    Prints an intro banner, awaits each section coroutine in a fixed
    order, then prints a closing message.
    """
    # Intro banner, one argument per output line
    print(
        "\n🚀 CRAWL4AI DEEP CRAWLING TUTORIAL 🚀",
        "======================================",
        "This tutorial will walk you through deep crawling techniques,",
        "from basic to advanced, using the Crawl4AI library.",
        sep="\n",
    )

    # Sections in execution order - comment entries out to run only
    # specific parts during development
    sections = (
        basic_deep_crawl,
        stream_vs_nonstream,
        filters_and_scorers,
        wrap_up,
        advanced_filters,
    )

    # Sections are awaited sequentially, never concurrently
    for run_section in sections:
        await run_section()

    print(
        "\n🎉 TUTORIAL COMPLETE! 🎉",
        "You now have a comprehensive understanding of deep crawling with Crawl4AI.",
        "For more information, check out https://docs.crawl4ai.com",
        sep="\n",
    )
|
||||
|
||||
|
||||
# Execute the tutorial when run directly
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(run_tutorial())
|
||||
@@ -39,9 +39,9 @@ async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str
|
||||
if result.success:
|
||||
print(f"\n=== {name} Results ===")
|
||||
print(f"Extracted Content: {result.extracted_content}")
|
||||
print(f"Raw Markdown Length: {len(result.markdown_v2.raw_markdown)}")
|
||||
print(f"Raw Markdown Length: {len(result.markdown.raw_markdown)}")
|
||||
print(
|
||||
f"Citations Markdown Length: {len(result.markdown_v2.markdown_with_citations)}"
|
||||
f"Citations Markdown Length: {len(result.markdown.markdown_with_citations)}"
|
||||
)
|
||||
else:
|
||||
print(f"Error in {name}: Crawl failed")
|
||||
|
||||
@@ -25,7 +25,7 @@ async def main():
|
||||
# url="https://www.helloworld.org", config=crawler_config
|
||||
url="https://www.kidocode.com", config=crawler_config
|
||||
)
|
||||
print(result.markdown_v2.raw_markdown[:500])
|
||||
print(result.markdown.raw_markdown[:500])
|
||||
# print(result.model_dump())
|
||||
|
||||
|
||||
|
||||
@@ -80,7 +80,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": null,
|
||||
"id": "003376f3",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -114,7 +114,7 @@
|
||||
" url=\"https://www.nbcnews.com/business\",\n",
|
||||
" bypass_cache=True # By default this is False, meaning the cache will be used\n",
|
||||
" )\n",
|
||||
" print(result.markdown[:500]) # Print the first 500 characters\n",
|
||||
" print(result.markdown.raw_markdown[:500]) # Print the first 500 characters\n",
|
||||
" \n",
|
||||
"asyncio.run(simple_crawl())"
|
||||
]
|
||||
@@ -129,7 +129,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": null,
|
||||
"id": "5bb8c1e4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -177,7 +177,7 @@
|
||||
" # wait_for=wait_for,\n",
|
||||
" bypass_cache=True,\n",
|
||||
" )\n",
|
||||
" print(result.markdown[:500]) # Print first 500 characters\n",
|
||||
" print(result.markdown.raw_markdown[:500]) # Print first 500 characters\n",
|
||||
"\n",
|
||||
"asyncio.run(crawl_dynamic_content())"
|
||||
]
|
||||
@@ -206,11 +206,11 @@
|
||||
" word_count_threshold=10,\n",
|
||||
" bypass_cache=True\n",
|
||||
" )\n",
|
||||
" full_markdown_length = len(result.markdown)\n",
|
||||
" fit_markdown_length = len(result.fit_markdown)\n",
|
||||
" full_markdown_length = len(result.markdown.raw_markdown)\n",
|
||||
" fit_markdown_length = len(result.markdown.fit_markdown)\n",
|
||||
" print(f\"Full Markdown Length: {full_markdown_length}\")\n",
|
||||
" print(f\"Fit Markdown Length: {fit_markdown_length}\")\n",
|
||||
" print(result.fit_markdown[:1000])\n",
|
||||
" print(result.markdown.fit_markdown[:1000])\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"asyncio.run(clean_content())"
|
||||
@@ -342,7 +342,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"execution_count": null,
|
||||
"id": "bc4d2fc8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -387,7 +387,7 @@
|
||||
" url=\"https://crawl4ai.com\",\n",
|
||||
" bypass_cache=True\n",
|
||||
" )\n",
|
||||
" print(result.markdown[:500]) # Display the first 500 characters\n",
|
||||
" print(result.markdown.raw_markdown[:500]) # Display the first 500 characters\n",
|
||||
"\n",
|
||||
"asyncio.run(custom_hook_workflow())"
|
||||
]
|
||||
@@ -465,7 +465,7 @@
|
||||
" bypass_cache=True\n",
|
||||
" )\n",
|
||||
" print(f\"Page {page_number} Content:\")\n",
|
||||
" print(result.markdown[:500]) # Print first 500 characters\n",
|
||||
" print(result.markdown.raw_markdown[:500]) # Print first 500 characters\n",
|
||||
"\n",
|
||||
"# asyncio.run(multi_page_session_crawl())"
|
||||
]
|
||||
|
||||
@@ -59,8 +59,8 @@ async def clean_content():
|
||||
url="https://en.wikipedia.org/wiki/Apple",
|
||||
config=crawler_config,
|
||||
)
|
||||
full_markdown_length = len(result.markdown_v2.raw_markdown)
|
||||
fit_markdown_length = len(result.markdown_v2.fit_markdown)
|
||||
full_markdown_length = len(result.markdown.raw_markdown)
|
||||
fit_markdown_length = len(result.markdown.fit_markdown)
|
||||
print(f"Full Markdown Length: {full_markdown_length}")
|
||||
print(f"Fit Markdown Length: {fit_markdown_length}")
|
||||
|
||||
@@ -139,7 +139,7 @@ async def custom_hook_workflow(verbose=True):
|
||||
|
||||
# Perform the crawl operation
|
||||
result = await crawler.arun(url="https://crawl4ai.com")
|
||||
print(result.markdown_v2.raw_markdown[:500].replace("\n", " -- "))
|
||||
print(result.markdown.raw_markdown[:500].replace("\n", " -- "))
|
||||
|
||||
|
||||
# Proxy Example
|
||||
@@ -584,9 +584,9 @@ async def speed_comparison():
|
||||
end = time.time()
|
||||
print("Crawl4AI (Markdown Plus):")
|
||||
print(f"Time taken: {end - start:.2f} seconds")
|
||||
print(f"Content length: {len(result.markdown_v2.raw_markdown)} characters")
|
||||
print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters")
|
||||
print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
|
||||
print(f"Content length: {len(result.markdown.raw_markdown)} characters")
|
||||
print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters")
|
||||
print(f"Images found: {result.markdown.raw_markdown.count('cldnry.s-nbcnews.com')}")
|
||||
print()
|
||||
|
||||
|
||||
|
||||
@@ -514,9 +514,9 @@ async def speed_comparison():
|
||||
end = time.time()
|
||||
print("Crawl4AI (Markdown Plus):")
|
||||
print(f"Time taken: {end - start:.2f} seconds")
|
||||
print(f"Content length: {len(result.markdown_v2.raw_markdown)} characters")
|
||||
print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters")
|
||||
print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
|
||||
print(f"Content length: {len(result.markdown.raw_markdown)} characters")
|
||||
print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters")
|
||||
print(f"Images found: {result.markdown.raw_markdown.count('cldnry.s-nbcnews.com')}")
|
||||
print()
|
||||
|
||||
# Crawl4AI with JavaScript execution
|
||||
@@ -539,9 +539,9 @@ async def speed_comparison():
|
||||
end = time.time()
|
||||
print("Crawl4AI (with JavaScript execution):")
|
||||
print(f"Time taken: {end - start:.2f} seconds")
|
||||
print(f"Content length: {len(result.markdown)} characters")
|
||||
print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters")
|
||||
print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
|
||||
print(f"Content length: {len(result.markdown.raw_markdown)} characters")
|
||||
print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters")
|
||||
print(f"Images found: {result.markdown.raw_markdown.count('cldnry.s-nbcnews.com')}")
|
||||
|
||||
print("\nNote on Speed Comparison:")
|
||||
print("The speed test conducted here may not reflect optimal conditions.")
|
||||
@@ -613,9 +613,9 @@ async def fit_markdown_remove_overlay():
|
||||
)
|
||||
|
||||
if result.success:
|
||||
print(len(result.markdown_v2.raw_markdown))
|
||||
print(len(result.markdown_v2.markdown_with_citations))
|
||||
print(len(result.markdown_v2.fit_markdown))
|
||||
print(len(result.markdown.raw_markdown))
|
||||
print(len(result.markdown.markdown_with_citations))
|
||||
print(len(result.markdown.fit_markdown))
|
||||
|
||||
# Save clean html
|
||||
with open(os.path.join(__location__, "output/cleaned_html.html"), "w") as f:
|
||||
@@ -624,18 +624,18 @@ async def fit_markdown_remove_overlay():
|
||||
with open(
|
||||
os.path.join(__location__, "output/output_raw_markdown.md"), "w"
|
||||
) as f:
|
||||
f.write(result.markdown_v2.raw_markdown)
|
||||
f.write(result.markdown.raw_markdown)
|
||||
|
||||
with open(
|
||||
os.path.join(__location__, "output/output_markdown_with_citations.md"),
|
||||
"w",
|
||||
) as f:
|
||||
f.write(result.markdown_v2.markdown_with_citations)
|
||||
f.write(result.markdown.markdown_with_citations)
|
||||
|
||||
with open(
|
||||
os.path.join(__location__, "output/output_fit_markdown.md"), "w"
|
||||
) as f:
|
||||
f.write(result.markdown_v2.fit_markdown)
|
||||
f.write(result.markdown.fit_markdown)
|
||||
|
||||
print("Done")
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ async def little_hello_web():
|
||||
result : CrawlResult = await crawler.arun(
|
||||
url="https://www.helloworld.org"
|
||||
)
|
||||
print(result.markdown_v2.raw_markdown[:500])
|
||||
print(result.markdown.raw_markdown[:500])
|
||||
|
||||
async def hello_web():
|
||||
browser_config = BrowserConfig(headless=True, verbose=True)
|
||||
@@ -42,7 +42,7 @@ async def hello_web():
|
||||
result : CrawlResult = await crawler.arun(
|
||||
url="https://www.helloworld.org", config=crawler_config
|
||||
)
|
||||
print(result.markdown_v2.fit_markdown[:500])
|
||||
print(result.markdown.fit_markdown[:500])
|
||||
|
||||
# Naive Approach Using Large Language Models
|
||||
async def extract_using_llm():
|
||||
|
||||
460
docs/examples/tutorial_v0.5.py
Normal file
460
docs/examples/tutorial_v0.5.py
Normal file
@@ -0,0 +1,460 @@
|
||||
import asyncio
|
||||
import time
|
||||
import re
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BrowserConfig, MemoryAdaptiveDispatcher, HTTPCrawlerConfig
|
||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||
from crawl4ai.deep_crawling import (
|
||||
BestFirstCrawlingStrategy,
|
||||
FilterChain,
|
||||
URLPatternFilter,
|
||||
DomainFilter,
|
||||
ContentTypeFilter,
|
||||
)
|
||||
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
|
||||
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
|
||||
from crawl4ai.configs import ProxyConfig
|
||||
from crawl4ai import RoundRobinProxyStrategy
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
from crawl4ai import DefaultMarkdownGenerator
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
|
||||
from pprint import pprint
|
||||
|
||||
|
||||
# 1️⃣ Deep Crawling with Best-First Strategy
|
||||
async def deep_crawl():
    """
    PART 1: Deep Crawling with Best-First Strategy

    Streams a best-first deep crawl of https://docs.crawl4ai.com that:
    - restricts URLs with a domain / URL-pattern / content-type filter chain
    - orders URLs with a KeywordRelevanceScorer
    - is configured with check_robots_txt=True

    Each page is printed as it arrives, followed by a summary of the
    page count, elapsed time and pages per depth.
    """
    print("\n===== DEEP CRAWLING =====")
    print("This example shows how to implement deep crawling with filters, scorers, and robots.txt compliance.")

    # Filters applied to candidate URLs: domain, URL pattern, content type
    url_filters = FilterChain(
        [
            DomainFilter(
                allowed_domains=["docs.crawl4ai.com"],
                blocked_domains=["old.docs.crawl4ai.com"],
            ),
            URLPatternFilter(patterns=["*core*", "*advanced*"]),
            ContentTypeFilter(allowed_types=["text/html"]),
        ]
    )

    # Scorer intended to put pages mentioning these keywords first
    relevance_scorer = KeywordRelevanceScorer(
        keywords=["crawl", "example", "async", "configuration"], weight=0.7
    )

    # Streaming run configuration with robots.txt checking switched on
    best_first = BestFirstCrawlingStrategy(
        max_depth=2,
        include_external=False,
        filter_chain=url_filters,
        url_scorer=relevance_scorer,
    )
    deep_crawl_config = CrawlerRunConfig(
        deep_crawl_strategy=best_first,
        scraping_strategy=LXMLWebScrapingStrategy(),
        stream=True,
        verbose=True,
        check_robots_txt=True,  # Enable robots.txt compliance
    )

    async with AsyncWebCrawler() as crawler:
        print("\n📊 Starting deep crawl with Best-First strategy...")
        print(" - Filtering by domain, URL patterns, and content type")
        print(" - Scoring pages based on keyword relevance")
        print(" - Respecting robots.txt rules")

        started = time.perf_counter()
        results = []

        stream = await crawler.arun(url="https://docs.crawl4ai.com", config=deep_crawl_config)
        async for page in stream:
            # Report every page the moment it is yielded
            page_depth = page.metadata.get("depth", 0)
            page_score = page.metadata.get("score", 0)
            print(f"Crawled: {page.url} (Depth: {page_depth}), score: {page_score:.2f}")
            results.append(page)

        duration = time.perf_counter() - started

        # Summary statistics
        print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")

        # Per-depth breakdown, only shown when something was crawled
        if results:
            depths_seen = [page.metadata.get("depth", 0) for page in results]

            print("\n📊 Pages crawled by depth:")
            for level in sorted(set(depths_seen)):
                print(f" Depth {level}: {depths_seen.count(level)} pages")
|
||||
|
||||
|
||||
# 2️⃣ Memory-Adaptive Dispatcher
|
||||
async def memory_adaptive_dispatcher():
    """
    PART 2: Memory-Adaptive Dispatcher

    This function demonstrates:
    - Using MemoryAdaptiveDispatcher to manage system memory
    - Batch and streaming modes with multiple URLs

    One dispatcher instance and one URL list are shared by two nested
    helpers, which are awaited in sequence: ``batch_mode`` (stream=False)
    and ``stream_mode`` (stream=True). Each helper opens its own crawler.
    """
    print("\n===== MEMORY-ADAPTIVE DISPATCHER =====")
    print("This example shows how to use the memory-adaptive dispatcher for resource management.")

    # Configure the dispatcher (optional, defaults are used if not provided)
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=80.0,  # Pause if memory usage exceeds 80%
        check_interval=0.5,  # Check memory every 0.5 seconds
    )

    # Test URLs
    urls = [
        "https://docs.crawl4ai.com",
        "https://github.com/unclecode/crawl4ai"
    ]

    async def batch_mode():
        """Crawl all URLs and print the results once the full batch is back."""
        print("\n📊 BATCH MODE:")
        print(" In this mode, all results are collected before being returned.")

        async with AsyncWebCrawler() as crawler:
            start_time = time.perf_counter()
            results = await crawler.arun_many(
                urls=urls,
                config=CrawlerRunConfig(stream=False),  # Batch mode
                dispatcher=dispatcher,
            )

            print(f" ✅ Received all {len(results)} results after {time.perf_counter() - start_time:.2f} seconds")
            for result in results:
                print(f" → {result.url} with status code: {result.status_code}")

    async def stream_mode():
        """Crawl all URLs and print each result as soon as it is yielded."""
        print("\n📊 STREAMING MODE:")
        print(" In this mode, results are processed as they become available.")

        async with AsyncWebCrawler() as crawler:
            start_time = time.perf_counter()
            count = 0
            first_result_time = None  # stays None if the stream yields nothing

            async for result in await crawler.arun_many(
                urls=urls,
                config=CrawlerRunConfig(stream=True),  # Stream mode
                dispatcher=dispatcher,
            ):
                count += 1
                current_time = time.perf_counter() - start_time

                if count == 1:
                    first_result_time = current_time
                    print(f" ✅ First result after {first_result_time:.2f} seconds: {result.url}")
                else:
                    print(f" → Result #{count} after {current_time:.2f} seconds: {result.url}")

            print(f" ✅ Total: {count} results")
            # Formatting None with ":.2f" raises TypeError, so only report
            # the first-result latency when a result actually arrived.
            if first_result_time is not None:
                print(f" ✅ First result: {first_result_time:.2f} seconds")
            else:
                print(" ⚠️ First result: n/a (no results were streamed)")
            print(f" ✅ All results: {time.perf_counter() - start_time:.2f} seconds")

    # Run both examples
    await batch_mode()
    await stream_mode()

    print("\n🔍 Key Takeaway: The memory-adaptive dispatcher prevents OOM errors")
    print(" and manages concurrency based on system resources.")
|
||||
|
||||
|
||||
# 3️⃣ HTTP Crawler Strategy
|
||||
async def http_crawler_strategy():
    """
    PART 3: HTTP Crawler Strategy

    Fetches https://example.com through AsyncHTTPCrawlerStrategy instead of
    a browser, using a custom User-Agent header, then prints the elapsed
    time, the status code, the length of the returned HTML and, when the
    final URL differs from the requested one, the redirect.
    """
    print("\n===== HTTP CRAWLER STRATEGY =====")
    print("This example shows how to use the fast, lightweight HTTP-only crawler.")

    # Request settings handed to the HTTP-only strategy
    request_headers = {"User-Agent": "MyCustomBot/1.0"}
    http_config = HTTPCrawlerConfig(
        method="GET",
        headers=request_headers,
        follow_redirects=True,
        verify_ssl=True,
    )

    print("\n📊 Initializing HTTP crawler strategy...")
    print(" - Using custom User-Agent: MyCustomBot/1.0")
    print(" - Following redirects: Enabled")
    print(" - Verifying SSL: Enabled")

    # Build the strategy first, then hand it to the crawler
    strategy = AsyncHTTPCrawlerStrategy(browser_config=http_config)
    async with AsyncWebCrawler(crawler_strategy=strategy) as crawler:
        started = time.perf_counter()
        result = await crawler.arun("https://example.com")
        elapsed = time.perf_counter() - started

        print(f"\n✅ Crawled in {elapsed:.2f} seconds")
        print(f"✅ Status code: {result.status_code}")
        print(f"✅ Content length: {len(result.html)} bytes")

        # Report a redirect only when the final URL differs from the request
        final_url = result.redirected_url
        if final_url and final_url != result.url:
            print(f"ℹ️ Redirected from {result.url} to {final_url}")

    print("\n🔍 Key Takeaway: HTTP crawler is faster and more memory-efficient")
    print(" than browser-based crawling for simple pages.")
|
||||
|
||||
|
||||
# 4️⃣ Proxy Rotation
|
||||
async def proxy_rotation():
    """
    PART 4: Proxy Rotation

    This function demonstrates:
    - Setting up a proxy rotation strategy
    - Using multiple proxies in a round-robin fashion

    Proxies are loaded with ProxyConfig.from_env(); when none are found the
    function prints a hint and returns early. Otherwise https://httpbin.org/ip
    is crawled twice per configured proxy and the IP found in each response
    is compared with the proxy's IP.
    """
    print("\n===== PROXY ROTATION =====")
    print("This example shows how to implement proxy rotation for distributed crawling.")

    # Load proxies and create rotation strategy
    proxies = ProxyConfig.from_env()
    # eg: export PROXIES="ip1:port1:username1:password1,ip2:port2:username2:password2"
    if not proxies:
        print("No proxies found in environment. Set PROXIES env variable!")
        return

    proxy_strategy = RoundRobinProxyStrategy(proxies)

    # Create configs
    browser_config = BrowserConfig(headless=True, verbose=False)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        proxy_rotation_strategy=proxy_strategy,
    )

    urls = ["https://httpbin.org/ip"] * (len(proxies) * 2)  # Test each proxy twice

    print("\n📈 Initializing crawler with proxy rotation...")
    # A single crawler is enough: the original nested a second, identical
    # AsyncWebCrawler inside the first and never crawled with the outer one.
    async with AsyncWebCrawler(config=browser_config) as crawler:
        print("\n🚀 Starting batch crawl with proxy rotation...")
        results = await crawler.arun_many(
            urls=urls,
            config=run_config,
        )
        for result in results:
            if result.success:
                # First dotted-quad in the response body (httpbin echoes the caller IP)
                ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
                # NOTE(review): this reads the proxy from the shared run config, not
                # from the individual result; whether the rotation strategy keeps
                # run_config.proxy_config in sync per request is not visible here.
                current_proxy = run_config.proxy_config or None

                if current_proxy and ip_match:
                    print(f"URL {result.url}")
                    print(f"Proxy {current_proxy.server} -> Response IP: {ip_match.group(0)}")
                    verified = ip_match.group(0) == current_proxy.ip
                    if verified:
                        print(f"✅ Proxy working! IP matches: {current_proxy.ip}")
                    else:
                        print("❌ Proxy failed or IP mismatch!")
                print("---")
            else:
                print(f"❌ Crawl via proxy failed!: {result.error_message}")
|
||||
|
||||
|
||||
# 5️⃣ LLM Content Filter (requires API key)
|
||||
async def llm_content_filter():
    """
    PART 5: LLM Content Filter

    This function demonstrates:
    - Configuring LLM providers via LlmConfig
    - Using LLM to generate focused markdown

    Crawls https://docs.crawl4ai.com with a DefaultMarkdownGenerator whose
    content filter is an LLMContentFilter, then pretty-prints
    result.markdown.fit_markdown.

    Note: Requires a valid API key for the chosen LLM provider
    """
    print("\n===== LLM CONTENT FILTER =====")
    print("This example shows how to use LLM to generate focused markdown content.")
    print("Note: This example requires an API key. Set it in environment variables.")

    # Create LLM configuration
    # Replace with your actual API key or set as environment variable
    llm_config = LlmConfig(
        provider="gemini/gemini-1.5-pro",
        api_token="env:GEMINI_API_KEY",  # Will read from GEMINI_API_KEY environment variable
    )

    print("\n📊 Setting up LLM content filter...")
    print(f" - Provider: {llm_config.provider}")
    print(" - API token: Using environment variable")
    print(" - Instruction: Extract key concepts and summaries")

    # Create markdown generator with LLM filter
    markdown_generator = DefaultMarkdownGenerator(
        content_filter=LLMContentFilter(
            llmConfig=llm_config,
            instruction="Extract key concepts and summaries",
        )
    )

    config = CrawlerRunConfig(markdown_generator=markdown_generator)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://docs.crawl4ai.com", config=config)
        # Print the heading before the content it introduces (the original
        # emitted the markdown first and the heading afterwards).
        print("\n✅ Generated focused markdown:")
        pprint(result.markdown.fit_markdown)
|
||||
|
||||
|
||||
|
||||
# 6️⃣ PDF Processing
|
||||
async def pdf_processing():
    """
    PART 6: PDF Processing

    Crawls a sample arXiv PDF with PDFCrawlerStrategy and
    PDFContentScrapingStrategy, then prints the processing time, the
    non-empty metadata entries (skipping the "html", "text" and "markdown"
    keys) and up to the first 500 characters of the extracted markdown.
    """
    print("\n===== PDF PROCESSING =====")
    print("This example shows how to extract text and metadata from PDF files.")

    # Sample PDF URL
    target_pdf = "https://arxiv.org/pdf/2310.06825.pdf"
    skipped_keys = ("html", "text", "markdown")
    preview_limit = 500

    print("\n📊 Initializing PDF crawler...")
    print(f" - Target PDF: {target_pdf}")
    print(" - Using PDFCrawlerStrategy and PDFContentScrapingStrategy")

    # Create crawler with PDF strategy
    async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler:
        print("\n🚀 Starting PDF processing...")

        started = time.perf_counter()
        scrape_config = CrawlerRunConfig(scraping_strategy=PDFContentScrapingStrategy())
        result = await crawler.arun(target_pdf, config=scrape_config)
        elapsed = time.perf_counter() - started

        print(f"\n✅ Processed PDF in {elapsed:.2f} seconds")

        # Show metadata
        print("\n📄 PDF Metadata:")
        metadata = result.metadata
        if not metadata:
            print(" No metadata available")
        else:
            for key, value in metadata.items():
                # Skip the bulky content keys and any empty values
                if key in skipped_keys or not value:
                    continue
                print(f" - {key}: {value}")

        # Show sample of content
        markdown = result.markdown
        if not markdown:
            print("\n⚠️ No content extracted")
        else:
            print("\n📝 PDF Content Sample:")
            if len(markdown) > preview_limit:
                content_sample = markdown[:preview_limit] + "..."
            else:
                content_sample = markdown
            print(f"---\n{content_sample}\n---")

    print("\n🔍 Key Takeaway: Crawl4AI can now process PDF files")
    print(" to extract both text content and metadata.")
|
||||
|
||||
|
||||
# 7️⃣ LLM Schema Generation (requires API key)
|
||||
async def llm_schema_generation():
    """
    PART 7: LLM Schema Generation

    Builds an LlmConfig, passes it together with a small product HTML
    snippet and a query to JsonCssExtractionStrategy.generate_schema, and
    pretty-prints the schema that comes back.

    Note: Requires a valid API key for the chosen LLM provider
    """
    print("\n===== LLM SCHEMA GENERATION =====")
    print("This example shows how to use LLM to automatically generate extraction schemas.")
    print("Note: This example requires an API key. Set it in environment variables.")

    # Sample HTML
    product_html = """
    <div class="product">
        <h2 class="title">Awesome Gaming Laptop</h2>
        <div class="price">$1,299.99</div>
        <div class="specs">
            <ul>
                <li>16GB RAM</li>
                <li>512GB SSD</li>
                <li>RTX 3080</li>
            </ul>
        </div>
        <div class="rating">4.7/5</div>
    </div>
    """

    print("\n📊 Setting up LlmConfig...")
    # Create LLM configuration
    provider_config = LlmConfig(
        provider="gemini/gemini-1.5-pro",
        api_token="env:GEMINI_API_KEY",
    )

    print("\n🚀 Generating schema for product extraction...")
    print(" This would use the LLM to analyze HTML and create an extraction schema")
    generated_schema = JsonCssExtractionStrategy.generate_schema(
        html=product_html,
        llmConfig=provider_config,
        query="Extract product name and price",
    )

    print("\n✅ Generated Schema:")
    pprint(generated_schema)
|
||||
|
||||
# Run all sections
|
||||
async def run_tutorial():
    """
    Main coroutine that walks through every tutorial section in order.

    Sections are awaited one after another. An exception raised by a
    section is printed with the section's name and the remaining sections
    still run.
    """
    intro_lines = (
        "\n🚀 CRAWL4AI v0.5.0 TUTORIAL 🚀",
        "===============================",
        "This tutorial demonstrates the key features of Crawl4AI v0.5.0",
        "Including deep crawling, memory-adaptive dispatching, advanced filtering,",
        "and more powerful extraction capabilities.",
    )
    for line in intro_lines:
        print(line)

    # Sections to run, in presentation order
    steps = (
        deep_crawl,                  # 1. Deep Crawling with Best-First Strategy
        memory_adaptive_dispatcher,  # 2. Memory-Adaptive Dispatcher
        http_crawler_strategy,       # 3. HTTP Crawler Strategy
        proxy_rotation,              # 4. Proxy Rotation
        llm_content_filter,          # 5. LLM Content Filter
        pdf_processing,              # 6. PDF Processing
        llm_schema_generation,       # 7. Schema Generation using LLM
    )

    for step in steps:
        try:
            await step()
        except Exception as exc:
            # Keep going: one failing section must not stop the tutorial
            print(f"⚠️ Error in {step.__name__}: {exc}")

    outro_lines = (
        "\n🎉 TUTORIAL COMPLETE! 🎉",
        "You've now explored the key features of Crawl4AI v0.5.0",
        "For more information, visit https://docs.crawl4ai.com",
    )
    for line in outro_lines:
        print(line)
|
||||
|
||||
|
||||
# Run the tutorial
|
||||
if __name__ == "__main__":
    # Script entry point: build the tutorial coroutine and run it to completion.
    tutorial = run_tutorial()
    asyncio.run(tutorial)
|
||||
Reference in New Issue
Block a user