Compare commits
2 Commits
v0.5.0.pos
...
unclecode-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e12d2e29e5 | ||
|
|
fc425023f5 |
@@ -1,2 +1,2 @@
|
|||||||
# crawl4ai/_version.py
|
# crawl4ai/_version.py
|
||||||
__version__ = "0.5.0.post1"
|
__version__ = "0.5.0"
|
||||||
|
|||||||
@@ -712,7 +712,7 @@ def profiles_cmd():
|
|||||||
# Run interactive profile manager
|
# Run interactive profile manager
|
||||||
anyio.run(manage_profiles)
|
anyio.run(manage_profiles)
|
||||||
|
|
||||||
@cli.command(name="")
|
@cli.command()
|
||||||
@click.argument("url", required=False)
|
@click.argument("url", required=False)
|
||||||
@click.option("--example", is_flag=True, help="Show usage examples")
|
@click.option("--example", is_flag=True, help="Show usage examples")
|
||||||
@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
|
@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
|
||||||
@@ -772,11 +772,5 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
|
|||||||
profile=profile
|
profile=profile
|
||||||
)
|
)
|
||||||
|
|
||||||
def main():
|
|
||||||
import sys
|
|
||||||
if len(sys.argv) < 2 or sys.argv[1] not in cli.commands:
|
|
||||||
sys.argv.insert(1, "crawl")
|
|
||||||
cli()
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
cli()
|
||||||
@@ -38,8 +38,8 @@ rate_limiting:
|
|||||||
|
|
||||||
# Security Configuration
|
# Security Configuration
|
||||||
security:
|
security:
|
||||||
enabled: true
|
enabled: false
|
||||||
jwt_enabled: true
|
jwt_enabled: false
|
||||||
https_redirect: false
|
https_redirect: false
|
||||||
trusted_hosts: ["*"]
|
trusted_hosts: ["*"]
|
||||||
headers:
|
headers:
|
||||||
|
|||||||
@@ -65,6 +65,7 @@ async def basic_deep_crawl():
|
|||||||
f"\n✅ Performance: {len(results)} pages in {time.perf_counter() - start_time:.2f} seconds"
|
f"\n✅ Performance: {len(results)} pages in {time.perf_counter() - start_time:.2f} seconds"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
# 2️⃣ Stream vs. Non-Stream Execution
|
# 2️⃣ Stream vs. Non-Stream Execution
|
||||||
async def stream_vs_nonstream():
|
async def stream_vs_nonstream():
|
||||||
"""
|
"""
|
||||||
@@ -126,6 +127,7 @@ async def stream_vs_nonstream():
|
|||||||
print(f" ✅ All results: {time.perf_counter() - start_time:.2f} seconds")
|
print(f" ✅ All results: {time.perf_counter() - start_time:.2f} seconds")
|
||||||
print("\n🔍 Key Takeaway: Streaming allows processing results immediately")
|
print("\n🔍 Key Takeaway: Streaming allows processing results immediately")
|
||||||
|
|
||||||
|
|
||||||
# 3️⃣ Introduce Filters & Scorers
|
# 3️⃣ Introduce Filters & Scorers
|
||||||
async def filters_and_scorers():
|
async def filters_and_scorers():
|
||||||
"""
|
"""
|
||||||
@@ -234,10 +236,82 @@ async def filters_and_scorers():
|
|||||||
print(f" ✅ Crawler prioritized {len(results)} pages by relevance score")
|
print(f" ✅ Crawler prioritized {len(results)} pages by relevance score")
|
||||||
print(" 🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first")
|
print(" 🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first")
|
||||||
|
|
||||||
# 4️⃣ Advanced Filters
|
|
||||||
|
# 4️⃣ Wrap-Up and Key Takeaways
|
||||||
|
async def wrap_up():
|
||||||
|
"""
|
||||||
|
PART 4: Wrap-Up and Key Takeaways
|
||||||
|
|
||||||
|
Summarize the key concepts learned in this tutorial.
|
||||||
|
"""
|
||||||
|
print("\n===== COMPLETE CRAWLER EXAMPLE =====")
|
||||||
|
print("Combining filters, scorers, and streaming for an optimized crawl")
|
||||||
|
|
||||||
|
# Create a sophisticated filter chain
|
||||||
|
filter_chain = FilterChain(
|
||||||
|
[
|
||||||
|
DomainFilter(
|
||||||
|
allowed_domains=["docs.crawl4ai.com"],
|
||||||
|
blocked_domains=["old.docs.crawl4ai.com"],
|
||||||
|
),
|
||||||
|
URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
|
||||||
|
ContentTypeFilter(allowed_types=["text/html"]),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create a composite scorer that combines multiple scoring strategies
|
||||||
|
keyword_scorer = KeywordRelevanceScorer(
|
||||||
|
keywords=["crawl", "example", "async", "configuration"], weight=0.7
|
||||||
|
)
|
||||||
|
# Set up the configuration
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
deep_crawl_strategy=BestFirstCrawlingStrategy(
|
||||||
|
max_depth=1,
|
||||||
|
include_external=False,
|
||||||
|
filter_chain=filter_chain,
|
||||||
|
url_scorer=keyword_scorer,
|
||||||
|
),
|
||||||
|
scraping_strategy=LXMLWebScrapingStrategy(),
|
||||||
|
stream=True,
|
||||||
|
verbose=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Execute the crawl
|
||||||
|
results = []
|
||||||
|
start_time = time.perf_counter()
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
async for result in await crawler.arun(
|
||||||
|
url="https://docs.crawl4ai.com", config=config
|
||||||
|
):
|
||||||
|
results.append(result)
|
||||||
|
score = result.metadata.get("score", 0)
|
||||||
|
depth = result.metadata.get("depth", 0)
|
||||||
|
print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
|
||||||
|
|
||||||
|
duration = time.perf_counter() - start_time
|
||||||
|
|
||||||
|
# Summarize the results
|
||||||
|
print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
|
||||||
|
print(
|
||||||
|
f"✅ Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Group by depth
|
||||||
|
depth_counts = {}
|
||||||
|
for result in results:
|
||||||
|
depth = result.metadata.get("depth", 0)
|
||||||
|
depth_counts[depth] = depth_counts.get(depth, 0) + 1
|
||||||
|
|
||||||
|
print("\n📊 Pages crawled by depth:")
|
||||||
|
for depth, count in sorted(depth_counts.items()):
|
||||||
|
print(f" Depth {depth}: {count} pages")
|
||||||
|
|
||||||
|
|
||||||
|
# 5️⃣ Advanced Filters
|
||||||
async def advanced_filters():
|
async def advanced_filters():
|
||||||
"""
|
"""
|
||||||
PART 4: Demonstrates advanced filtering techniques for specialized crawling.
|
PART 5: Demonstrates advanced filtering techniques for specialized crawling.
|
||||||
|
|
||||||
This function covers:
|
This function covers:
|
||||||
- SEO filters
|
- SEO filters
|
||||||
@@ -297,10 +371,11 @@ async def advanced_filters():
|
|||||||
relevance_score = result.metadata.get("relevance_score", 0)
|
relevance_score = result.metadata.get("relevance_score", 0)
|
||||||
print(f" → Score: {relevance_score:.2f} | {result.url}")
|
print(f" → Score: {relevance_score:.2f} | {result.url}")
|
||||||
|
|
||||||
# 5️⃣ Max Pages and Score Thresholds
|
|
||||||
|
# Main function to run the entire tutorial
|
||||||
async def max_pages_and_thresholds():
|
async def max_pages_and_thresholds():
|
||||||
"""
|
"""
|
||||||
PART 5: Demonstrates using max_pages and score_threshold parameters with different strategies.
|
PART 6: Demonstrates using max_pages and score_threshold parameters with different strategies.
|
||||||
|
|
||||||
This function shows:
|
This function shows:
|
||||||
- How to limit the number of pages crawled
|
- How to limit the number of pages crawled
|
||||||
@@ -396,77 +471,6 @@ async def max_pages_and_thresholds():
|
|||||||
print(f" ✅ Average score: {avg_score:.2f}")
|
print(f" ✅ Average score: {avg_score:.2f}")
|
||||||
print(" 🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first")
|
print(" 🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first")
|
||||||
|
|
||||||
# 6️⃣ Wrap-Up and Key Takeaways
|
|
||||||
async def wrap_up():
|
|
||||||
"""
|
|
||||||
PART 6: Wrap-Up and Key Takeaways
|
|
||||||
|
|
||||||
Summarize the key concepts learned in this tutorial.
|
|
||||||
"""
|
|
||||||
print("\n===== COMPLETE CRAWLER EXAMPLE =====")
|
|
||||||
print("Combining filters, scorers, and streaming for an optimized crawl")
|
|
||||||
|
|
||||||
# Create a sophisticated filter chain
|
|
||||||
filter_chain = FilterChain(
|
|
||||||
[
|
|
||||||
DomainFilter(
|
|
||||||
allowed_domains=["docs.crawl4ai.com"],
|
|
||||||
blocked_domains=["old.docs.crawl4ai.com"],
|
|
||||||
),
|
|
||||||
URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
|
|
||||||
ContentTypeFilter(allowed_types=["text/html"]),
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create a composite scorer that combines multiple scoring strategies
|
|
||||||
keyword_scorer = KeywordRelevanceScorer(
|
|
||||||
keywords=["crawl", "example", "async", "configuration"], weight=0.7
|
|
||||||
)
|
|
||||||
# Set up the configuration
|
|
||||||
config = CrawlerRunConfig(
|
|
||||||
deep_crawl_strategy=BestFirstCrawlingStrategy(
|
|
||||||
max_depth=1,
|
|
||||||
include_external=False,
|
|
||||||
filter_chain=filter_chain,
|
|
||||||
url_scorer=keyword_scorer,
|
|
||||||
),
|
|
||||||
scraping_strategy=LXMLWebScrapingStrategy(),
|
|
||||||
stream=True,
|
|
||||||
verbose=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Execute the crawl
|
|
||||||
results = []
|
|
||||||
start_time = time.perf_counter()
|
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
|
||||||
async for result in await crawler.arun(
|
|
||||||
url="https://docs.crawl4ai.com", config=config
|
|
||||||
):
|
|
||||||
results.append(result)
|
|
||||||
score = result.metadata.get("score", 0)
|
|
||||||
depth = result.metadata.get("depth", 0)
|
|
||||||
print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
|
|
||||||
|
|
||||||
duration = time.perf_counter() - start_time
|
|
||||||
|
|
||||||
# Summarize the results
|
|
||||||
print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
|
|
||||||
print(
|
|
||||||
f"✅ Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Group by depth
|
|
||||||
depth_counts = {}
|
|
||||||
for result in results:
|
|
||||||
depth = result.metadata.get("depth", 0)
|
|
||||||
depth_counts[depth] = depth_counts.get(depth, 0) + 1
|
|
||||||
|
|
||||||
print("\n📊 Pages crawled by depth:")
|
|
||||||
for depth, count in sorted(depth_counts.items()):
|
|
||||||
print(f" Depth {depth}: {count} pages")
|
|
||||||
|
|
||||||
|
|
||||||
async def run_tutorial():
|
async def run_tutorial():
|
||||||
"""
|
"""
|
||||||
Executes all tutorial sections in sequence.
|
Executes all tutorial sections in sequence.
|
||||||
@@ -478,12 +482,12 @@ async def run_tutorial():
|
|||||||
|
|
||||||
# Define sections - uncomment to run specific parts during development
|
# Define sections - uncomment to run specific parts during development
|
||||||
tutorial_sections = [
|
tutorial_sections = [
|
||||||
basic_deep_crawl,
|
# basic_deep_crawl,
|
||||||
stream_vs_nonstream,
|
# stream_vs_nonstream,
|
||||||
filters_and_scorers,
|
# filters_and_scorers,
|
||||||
max_pages_and_thresholds,
|
max_pages_and_thresholds, # Added new section
|
||||||
advanced_filters,
|
|
||||||
wrap_up,
|
wrap_up,
|
||||||
|
advanced_filters,
|
||||||
]
|
]
|
||||||
|
|
||||||
for section in tutorial_sections:
|
for section in tutorial_sections:
|
||||||
@@ -493,6 +497,7 @@ async def run_tutorial():
|
|||||||
print("You now have a comprehensive understanding of deep crawling with Crawl4AI.")
|
print("You now have a comprehensive understanding of deep crawling with Crawl4AI.")
|
||||||
print("For more information, check out https://docs.crawl4ai.com")
|
print("For more information, check out https://docs.crawl4ai.com")
|
||||||
|
|
||||||
|
|
||||||
# Execute the tutorial when run directly
|
# Execute the tutorial when run directly
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
asyncio.run(run_tutorial())
|
asyncio.run(run_tutorial())
|
||||||
@@ -78,7 +78,7 @@ crawl4ai-download-models = "crawl4ai.model_loader:main"
|
|||||||
crawl4ai-migrate = "crawl4ai.migrations:main"
|
crawl4ai-migrate = "crawl4ai.migrations:main"
|
||||||
crawl4ai-setup = "crawl4ai.install:post_install"
|
crawl4ai-setup = "crawl4ai.install:post_install"
|
||||||
crawl4ai-doctor = "crawl4ai.install:doctor"
|
crawl4ai-doctor = "crawl4ai.install:doctor"
|
||||||
crwl = "crawl4ai.cli:main"
|
crwl = "crawl4ai.cli:cli"
|
||||||
|
|
||||||
[tool.setuptools]
|
[tool.setuptools]
|
||||||
packages = {find = {where = ["."], include = ["crawl4ai*"]}}
|
packages = {find = {where = ["."], include = ["crawl4ai*"]}}
|
||||||
|
|||||||
Reference in New Issue
Block a user