Merge branch 'develop' of https://github.com/unclecode/crawl4ai into develop
@@ -373,7 +373,7 @@ async def main():
     async with AsyncWebCrawler(config=browser_config) as crawler:
         result = await crawler.arun(
-            url="https://docs.micronaut.io/4.7.6/guide/",
+            url="https://docs.micronaut.io/4.9.9/guide/",
            config=run_config
         )
         print(len(result.markdown.raw_markdown))

@@ -425,7 +425,7 @@ async def main():
                 "type": "attribute",
                 "attribute": "src"
             }
-        }
+        ]
     }

     extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

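For context on the `}` to `]` fix: in a `JsonCssExtractionStrategy` schema the `fields` key holds a list of field definitions, so the block opened with `[` has to close with `]`. Below is a minimal sketch of that shape; the selector values are illustrative and the import path is an assumption based on the crawl4ai docs, not part of this commit.

```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

# Hypothetical schema: grab every image URL inside a product card.
schema = {
    "name": "Product Images",        # label for the extracted items
    "baseSelector": "div.product",   # one extracted item per matching element
    "fields": [
        {
            "name": "image",
            "selector": "img",
            "type": "attribute",     # read an attribute instead of element text
            "attribute": "src"
        }
    ]                                # the fields list closes with ], as in the fix above
}

extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
```
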
@@ -4,7 +4,7 @@ import asyncio
 from typing import List, Tuple, Dict
 from functools import partial
 from uuid import uuid4
-from datetime import datetime
+from datetime import datetime, timezone
 from base64 import b64encode

 import logging

@@ -576,7 +576,7 @@ async def handle_crawl_job(
    task_id = f"crawl_{uuid4().hex[:8]}"
    await redis.hset(f"task:{task_id}", mapping={
        "status": TaskStatus.PROCESSING,  # <-- keep enum values consistent
-        "created_at": datetime.utcnow().isoformat(),
+        "created_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
        "url": json.dumps(urls),  # store list as JSON string
        "result": "",
        "error": "",

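The `created_at` change above replaces `datetime.utcnow()`, which is deprecated since Python 3.12, with an explicitly timezone-aware call; the trailing `.replace(tzinfo=None)` drops the offset again so the stored ISO string keeps the same naive format as before. A stdlib-only sketch of the equivalence:

```python
from datetime import datetime, timezone

# Old style: naive UTC timestamp, deprecated since Python 3.12.
naive = datetime.utcnow()

# New style: build an aware UTC timestamp, then strip tzinfo so the
# resulting ISO string has no "+00:00" suffix, matching the old output.
aware = datetime.now(timezone.utc)
compatible = aware.replace(tzinfo=None)

print(naive.isoformat())       # e.g. 2024-01-01T12:00:00.000000
print(aware.isoformat())       # e.g. 2024-01-01T12:00:00.000000+00:00
print(compatible.isoformat())  # same shape as the utcnow() output
```
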
@@ -102,16 +102,16 @@ async def smart_blog_crawler():

     # Step 2: Configure discovery - let's find all blog posts
     config = SeedingConfig(
-        source="sitemap",         # Use the website's sitemap
-        pattern="*/blog/*.html",  # Only blog posts
+        source="sitemap+cc",      # Use the website's sitemap+cc
+        pattern="*/courses/*",    # Only courses related posts
         extract_head=True,        # Get page metadata
         max_urls=100              # Limit for this example
     )

     # Step 3: Discover URLs from the Python blog
-    print("🔍 Discovering blog posts...")
+    print("🔍 Discovering course posts...")
     urls = await seeder.urls("realpython.com", config)
-    print(f"✅ Found {len(urls)} blog posts")
+    print(f"✅ Found {len(urls)} course posts")

     # Step 4: Filter for Python tutorials (using metadata!)
     tutorials = [

@@ -134,7 +134,8 @@ async def smart_blog_crawler():
     async with AsyncWebCrawler() as crawler:
         config = CrawlerRunConfig(
             only_text=True,
-            word_count_threshold=300  # Only substantial articles
+            word_count_threshold=300,  # Only substantial articles
+            stream=True
         )

         # Extract URLs and crawl them

@@ -155,7 +156,7 @@ asyncio.run(smart_blog_crawler())

 **What just happened?**

-1. We discovered all blog URLs from the sitemap
+1. We discovered all blog URLs from the sitemap+cc
 2. We filtered using metadata (no crawling needed!)
 3. We crawled only the relevant tutorials
 4. We saved tons of time and bandwidth

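Taken together, the updated example discovers URLs with `AsyncUrlSeeder`, filters on metadata without crawling, and then streams crawl results. Below is a condensed sketch of that discover, filter, crawl flow assembled from the snippets in this commit; the top-level import paths, the `url`/`head_data` record keys, and the `arun_many(..., stream=True)` async-iteration pattern are assumptions rather than verbatim from the diff.

```python
import asyncio
from crawl4ai import AsyncWebCrawler, AsyncUrlSeeder, CrawlerRunConfig, SeedingConfig

async def main():
    # Discover candidate URLs from the sitemap plus the Common Crawl index.
    seed_cfg = SeedingConfig(
        source="sitemap+cc",
        pattern="*/courses/*",
        extract_head=True,
        max_urls=100,
    )
    async with AsyncUrlSeeder() as seeder:
        urls = await seeder.urls("realpython.com", seed_cfg)

    # Illustrative metadata filter: keep URLs whose <head> data mentions "python".
    picked = [u["url"] for u in urls if "python" in str(u.get("head_data", "")).lower()]

    # Crawl only the selected URLs, consuming results as they finish.
    run_cfg = CrawlerRunConfig(only_text=True, word_count_threshold=300, stream=True)
    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun_many(picked, config=run_cfg):
            print(result.url, len(result.markdown.raw_markdown))

asyncio.run(main())
```
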
@@ -282,8 +283,8 @@ config = SeedingConfig(
     live_check=True,  # Verify each URL is accessible
     concurrency=20    # Check 20 URLs in parallel
 )
-
+async with AsyncUrlSeeder() as seeder:
     urls = await seeder.urls("example.com", config)

 # Now you can filter by status
 live_urls = [u for u in urls if u["status"] == "valid"]

@@ -311,8 +312,8 @@ This is where URL seeding gets really powerful. Instead of crawling entire pages
     extract_head=True  # Extract metadata from <head> section
 )
-
+async with AsyncUrlSeeder() as seeder:
     urls = await seeder.urls("example.com", config)

 # Now each URL has rich metadata
 for url in urls[:3]:

@@ -387,8 +388,8 @@ config = SeedingConfig(
     scoring_method="bm25",
     score_threshold=0.3
 )
-
+async with AsyncUrlSeeder() as seeder:
     urls = await seeder.urls("example.com", config)

 # URLs are scored based on:
 # 1. Domain parts matching (e.g., 'python' in python.example.com)

@@ -429,8 +430,8 @@ config = SeedingConfig(
     extract_head=True,
     live_check=True
 )
-
+async with AsyncUrlSeeder() as seeder:
     urls = await seeder.urls("blog.example.com", config)

 # Analyze the results
 for url in urls[:5]:

@@ -488,8 +489,8 @@ config = SeedingConfig(
     scoring_method="bm25",  # Use BM25 algorithm
     score_threshold=0.3     # Minimum relevance score
 )
-
+async with AsyncUrlSeeder() as seeder:
     urls = await seeder.urls("realpython.com", config)

 # Results are automatically sorted by relevance!
 for url in urls[:5]:

@@ -511,8 +512,8 @@ config = SeedingConfig(
     score_threshold=0.5,
     max_urls=20
 )
-
+async with AsyncUrlSeeder() as seeder:
     urls = await seeder.urls("docs.example.com", config)

 # The highest scoring URLs will be API docs!
 ```

@@ -529,8 +530,8 @@ config = SeedingConfig(
     score_threshold=0.4,
     pattern="*/product/*"  # Combine with pattern matching
 )
-
+async with AsyncUrlSeeder() as seeder:
     urls = await seeder.urls("shop.example.com", config)

 # Filter further by price (from metadata)
 affordable = [

@@ -550,8 +551,8 @@ config = SeedingConfig(
     scoring_method="bm25",
     score_threshold=0.35
 )
-
+async with AsyncUrlSeeder() as seeder:
     urls = await seeder.urls("technews.com", config)

 # Filter by date
 from datetime import datetime, timedelta

@@ -591,8 +592,8 @@ for query in queries:
         score_threshold=0.4,
         max_urls=10  # Top 10 per topic
     )
-
+    async with AsyncUrlSeeder() as seeder:
         urls = await seeder.urls("learning-platform.com", config)
         all_tutorials.extend(urls)

 # Remove duplicates while preserving order

@@ -625,7 +626,8 @@ config = SeedingConfig(
 )

 # Returns a dictionary: {domain: [urls]}
-results = await seeder.many_urls(domains, config)
+async with AsyncUrlSeeder() as seeder:
+    results = await seeder.many_urls(domains, config)

 # Process results
 for domain, urls in results.items():

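The same wrapping applies when seeding several domains at once: `many_urls` returns a `{domain: [urls]}` mapping, as the comment above notes. A minimal sketch with placeholder domains and an assumed top-level import path:

```python
import asyncio
from crawl4ai import AsyncUrlSeeder, SeedingConfig

async def seed_many(domains):
    """Discover blog URLs for several domains in one pass."""
    config = SeedingConfig(source="sitemap", pattern="*/blog/*", max_urls=100)
    # Use the context manager so the seeder is closed when the block exits.
    async with AsyncUrlSeeder() as seeder:
        # Returns {domain: [url records]}, one list per input domain.
        return await seeder.many_urls(domains, config)

results = asyncio.run(seed_many(["example.com", "example.org"]))
for domain, urls in results.items():
    print(domain, len(urls))
```
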
@@ -654,8 +656,8 @@ config = SeedingConfig(
     pattern="*/blog/*",
     max_urls=100
 )
-
+async with AsyncUrlSeeder() as seeder:
     results = await seeder.many_urls(competitors, config)

 # Analyze content types
 for domain, urls in results.items():

@@ -690,8 +692,8 @@ config = SeedingConfig(
     score_threshold=0.3,
     max_urls=20  # Per site
 )
-
+async with AsyncUrlSeeder() as seeder:
     results = await seeder.many_urls(educational_sites, config)

 # Find the best beginner tutorials
 all_tutorials = []

@@ -731,8 +733,8 @@ config = SeedingConfig(
     score_threshold=0.5,  # High threshold for relevance
     max_urls=10
 )
-
+async with AsyncUrlSeeder() as seeder:
     results = await seeder.many_urls(news_sites, config)

 # Collect all mentions
 mentions = []