Merge branch 'develop' of https://github.com/unclecode/crawl4ai into develop

This commit is contained in:
ntohidi
2025-08-20 16:56:19 +08:00
3 changed files with 37 additions and 35 deletions

View File

@@ -373,7 +373,7 @@ async def main():
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="https://docs.micronaut.io/4.7.6/guide/",
url="https://docs.micronaut.io/4.9.9/guide/",
config=run_config
)
print(len(result.markdown.raw_markdown))
@@ -425,7 +425,7 @@ async def main():
"type": "attribute",
"attribute": "src"
}
}
]
}
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

View File

@@ -4,7 +4,7 @@ import asyncio
from typing import List, Tuple, Dict
from functools import partial
from uuid import uuid4
from datetime import datetime
from datetime import datetime, timezone
from base64 import b64encode
import logging
@@ -576,7 +576,7 @@ async def handle_crawl_job(
task_id = f"crawl_{uuid4().hex[:8]}"
await redis.hset(f"task:{task_id}", mapping={
"status": TaskStatus.PROCESSING, # <-- keep enum values consistent
"created_at": datetime.utcnow().isoformat(),
"created_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
"url": json.dumps(urls), # store list as JSON string
"result": "",
"error": "",

View File

@@ -102,16 +102,16 @@ async def smart_blog_crawler():
# Step 2: Configure discovery - let's find all blog posts
config = SeedingConfig(
source="sitemap", # Use the website's sitemap
pattern="*/blog/*.html", # Only blog posts
source="sitemap+cc", # Use the website's sitemap+cc
pattern="*/courses/*", # Only courses related posts
extract_head=True, # Get page metadata
max_urls=100 # Limit for this example
)
# Step 3: Discover URLs from the Python blog
print("🔍 Discovering blog posts...")
print("🔍 Discovering course posts...")
urls = await seeder.urls("realpython.com", config)
print(f"✅ Found {len(urls)} blog posts")
print(f"✅ Found {len(urls)} course posts")
# Step 4: Filter for Python tutorials (using metadata!)
tutorials = [
@@ -134,7 +134,8 @@ async def smart_blog_crawler():
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
only_text=True,
word_count_threshold=300 # Only substantial articles
word_count_threshold=300, # Only substantial articles
stream=True
)
# Extract URLs and crawl them
@@ -155,7 +156,7 @@ asyncio.run(smart_blog_crawler())
**What just happened?**
1. We discovered all blog URLs from the sitemap
1. We discovered all blog URLs from the sitemap+cc
2. We filtered using metadata (no crawling needed!)
3. We crawled only the relevant tutorials
4. We saved tons of time and bandwidth
@@ -282,7 +283,7 @@ config = SeedingConfig(
live_check=True, # Verify each URL is accessible
concurrency=20 # Check 20 URLs in parallel
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("example.com", config)
# Now you can filter by status
@@ -311,7 +312,7 @@ This is where URL seeding gets really powerful. Instead of crawling entire pages
config = SeedingConfig(
extract_head=True # Extract metadata from <head> section
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("example.com", config)
# Now each URL has rich metadata
@@ -387,7 +388,7 @@ config = SeedingConfig(
scoring_method="bm25",
score_threshold=0.3
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("example.com", config)
# URLs are scored based on:
@@ -429,7 +430,7 @@ config = SeedingConfig(
extract_head=True,
live_check=True
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("blog.example.com", config)
# Analyze the results
@@ -488,7 +489,7 @@ config = SeedingConfig(
scoring_method="bm25", # Use BM25 algorithm
score_threshold=0.3 # Minimum relevance score
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("realpython.com", config)
# Results are automatically sorted by relevance!
@@ -511,7 +512,7 @@ config = SeedingConfig(
score_threshold=0.5,
max_urls=20
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("docs.example.com", config)
# The highest scoring URLs will be API docs!
@@ -529,7 +530,7 @@ config = SeedingConfig(
score_threshold=0.4,
pattern="*/product/*" # Combine with pattern matching
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("shop.example.com", config)
# Filter further by price (from metadata)
@@ -550,7 +551,7 @@ config = SeedingConfig(
scoring_method="bm25",
score_threshold=0.35
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("technews.com", config)
# Filter by date
@@ -591,7 +592,7 @@ for query in queries:
score_threshold=0.4,
max_urls=10 # Top 10 per topic
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("learning-platform.com", config)
all_tutorials.extend(urls)
@@ -625,6 +626,7 @@ config = SeedingConfig(
)
# Returns a dictionary: {domain: [urls]}
async with AsyncUrlSeeder() as seeder:
results = await seeder.many_urls(domains, config)
# Process results
@@ -654,7 +656,7 @@ config = SeedingConfig(
pattern="*/blog/*",
max_urls=100
)
async with AsyncUrlSeeder() as seeder:
results = await seeder.many_urls(competitors, config)
# Analyze content types
@@ -690,7 +692,7 @@ config = SeedingConfig(
score_threshold=0.3,
max_urls=20 # Per site
)
async with AsyncUrlSeeder() as seeder:
results = await seeder.many_urls(educational_sites, config)
# Find the best beginner tutorials
@@ -731,7 +733,7 @@ config = SeedingConfig(
score_threshold=0.5, # High threshold for relevance
max_urls=10
)
async with AsyncUrlSeeder() as seeder:
results = await seeder.many_urls(news_sites, config)
# Collect all mentions