Apply Ruff Corrections
@@ -9,13 +9,11 @@ from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
 from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
 import json


 async def extract_amazon_products():
     # Initialize browser config
-    browser_config = BrowserConfig(
-        browser_type="chromium",
-        headless=True
-    )
+    browser_config = BrowserConfig(browser_type="chromium", headless=True)

     # Initialize crawler config with JSON CSS extraction strategy
     crawler_config = CrawlerRunConfig(
         extraction_strategy=JsonCssExtractionStrategy(
@@ -27,74 +25,70 @@ async def extract_amazon_products():
                     "name": "asin",
                     "selector": "",
                     "type": "attribute",
-                    "attribute": "data-asin"
-                },
-                {
-                    "name": "title",
-                    "selector": "h2 a span",
-                    "type": "text"
+                    "attribute": "data-asin",
                 },
+                {"name": "title", "selector": "h2 a span", "type": "text"},
                 {
                     "name": "url",
                     "selector": "h2 a",
                     "type": "attribute",
-                    "attribute": "href"
+                    "attribute": "href",
                 },
                 {
                     "name": "image",
                     "selector": ".s-image",
                     "type": "attribute",
-                    "attribute": "src"
+                    "attribute": "src",
                 },
 [... the rating (".a-icon-star-small .a-icon-alt"), reviews_count ("[data-csa-c-func-deps='aui-da-a-popover'] ~ span span"), price (".a-price .a-offscreen"), original_price (".a-price.a-text-price .a-offscreen"), and sponsored (".puis-sponsored-label-text") fields get the same fix: a trailing comma after their "type": "text" / "type": "exists" value ...]
                 {
                     "name": "delivery_info",
                     "selector": "[data-cy='delivery-recipe'] .a-color-base",
                     "type": "text",
-                    "multiple": True
-                }
-            ]
+                    "multiple": True,
+                },
+            ],
         }
     )
 )

     # Example search URL (you should replace with your actual Amazon URL)
     url = "https://www.amazon.com/s?k=Samsung+Galaxy+Tab"

     # Use context manager for proper resource handling
     async with AsyncWebCrawler(config=browser_config) as crawler:
         # Extract the data
         result = await crawler.arun(url=url, config=crawler_config)

         # Process and print the results
         if result and result.extracted_content:
             # Parse the JSON string into a list of products
             products = json.loads(result.extracted_content)

             # Process each product in the list
             for product in products:
                 print("\nProduct Details:")
@@ -105,10 +99,12 @@ async def extract_amazon_products():
                 print(f"Rating: {product.get('rating')}")
                 print(f"Reviews: {product.get('reviews_count')}")
                 print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
-                if product.get('delivery_info'):
+                if product.get("delivery_info"):
                     print(f"Delivery: {' '.join(product['delivery_info'])}")
                 print("-" * 80)


 if __name__ == "__main__":
     import asyncio

     asyncio.run(extract_amazon_products())
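For readers skimming the diff, here is a minimal, self-contained sketch of the pattern this first example uses after reformatting: a JsonCssExtractionStrategy schema passed through CrawlerRunConfig. The target URL and the two-field schema below are illustrative placeholders, not part of the commit.

```python
import asyncio
import json

from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy


async def main():
    # Each element matched by baseSelector becomes one JSON object with these fields.
    schema = {
        "name": "Products",
        "baseSelector": "[data-component-type='s-search-result']",
        "fields": [
            {"name": "title", "selector": "h2 a span", "type": "text"},
            {"name": "url", "selector": "h2 a", "type": "attribute", "attribute": "href"},
        ],
    }
    config = CrawlerRunConfig(extraction_strategy=JsonCssExtractionStrategy(schema=schema))
    browser_config = BrowserConfig(browser_type="chromium", headless=True)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url="https://example.com", config=config)
        if result and result.extracted_content:
            # extracted_content is a JSON string: a list of dicts, one per matched element.
            for item in json.loads(result.extracted_content):
                print(item)


if __name__ == "__main__":
    asyncio.run(main())
```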
@@ -10,17 +10,17 @@ from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
 import json
 from playwright.async_api import Page, BrowserContext


 async def extract_amazon_products():
     # Initialize browser config
     browser_config = BrowserConfig(
         # browser_type="chromium",
         headless=True
     )

     # Initialize crawler config with JSON CSS extraction strategy nav-search-submit-button
     crawler_config = CrawlerRunConfig(
         cache_mode=CacheMode.BYPASS,
         extraction_strategy=JsonCssExtractionStrategy(
             schema={
                 "name": "Amazon Product Search Results",
@@ -30,102 +30,105 @@ async def extract_amazon_products():
 [... identical JsonCssExtractionStrategy schema to the first example above: the asin, title, url, image, rating, reviews_count, price, original_price, sponsored, and delivery_info fields receive the same trailing-comma and line-collapsing fixes ...]
             }
-        )
+        ),
     )
     url = "https://www.amazon.com/"

-    async def after_goto(page: Page, context: BrowserContext, url: str, response: dict, **kwargs):
+    async def after_goto(
+        page: Page, context: BrowserContext, url: str, response: dict, **kwargs
+    ):
         """Hook called after navigating to each URL"""
         print(f"[HOOK] after_goto - Successfully loaded: {url}")

         try:
             # Wait for search box to be available
-            search_box = await page.wait_for_selector('#twotabsearchtextbox', timeout=1000)
+            search_box = await page.wait_for_selector(
+                "#twotabsearchtextbox", timeout=1000
+            )

             # Type the search query
-            await search_box.fill('Samsung Galaxy Tab')
+            await search_box.fill("Samsung Galaxy Tab")

             # Get the search button and prepare for navigation
-            search_button = await page.wait_for_selector('#nav-search-submit-button', timeout=1000)
+            search_button = await page.wait_for_selector(
+                "#nav-search-submit-button", timeout=1000
+            )

             # Click with navigation waiting
             await search_button.click()

             # Wait for search results to load
-            await page.wait_for_selector('[data-component-type="s-search-result"]', timeout=10000)
+            await page.wait_for_selector(
+                '[data-component-type="s-search-result"]', timeout=10000
+            )
             print("[HOOK] Search completed and results loaded!")

         except Exception as e:
             print(f"[HOOK] Error during search operation: {str(e)}")

         return page

     # Use context manager for proper resource handling
     async with AsyncWebCrawler(config=browser_config) as crawler:
         crawler.crawler_strategy.set_hook("after_goto", after_goto)

         # Extract the data
         result = await crawler.arun(url=url, config=crawler_config)

         # Process and print the results
         if result and result.extracted_content:
             # Parse the JSON string into a list of products
             products = json.loads(result.extracted_content)

             # Process each product in the list
             for product in products:
                 print("\nProduct Details:")
@@ -136,10 +139,12 @@ async def extract_amazon_products():
 [... same result-printing block as the first example: product.get('delivery_info') switches to double quotes and blank lines before the __main__ guard are normalized ...]

 if __name__ == "__main__":
     import asyncio

     asyncio.run(extract_amazon_products())
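As context for the hook change above, a condensed sketch of how an `after_goto` hook is attached to the crawler strategy. The selector and search text mirror the diff; the rest is an illustrative harness, not the full example file.

```python
import asyncio

from playwright.async_api import Page, BrowserContext

from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig


async def after_goto(
    page: Page, context: BrowserContext, url: str, response: dict, **kwargs
):
    # Runs after every navigation, so the page can be driven before extraction happens.
    box = await page.wait_for_selector("#twotabsearchtextbox", timeout=1000)
    await box.fill("Samsung Galaxy Tab")
    return page


async def run():
    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
        # Hooks are registered on the underlying crawler strategy by name.
        crawler.crawler_strategy.set_hook("after_goto", after_goto)
        result = await crawler.arun(url="https://www.amazon.com/", config=CrawlerRunConfig())
        print(len(result.html))


if __name__ == "__main__":
    asyncio.run(run())
```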
@@ -8,7 +8,7 @@ from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
 from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
 import json
 from playwright.async_api import Page, BrowserContext


 async def extract_amazon_products():
     # Initialize browser config
@@ -16,7 +16,7 @@ async def extract_amazon_products():
         # browser_type="chromium",
         headless=True
     )

     js_code_to_search = """
     const task = async () => {
         document.querySelector('#twotabsearchtextbox').value = 'Samsung Galaxy Tab';
@@ -30,7 +30,7 @@ async def extract_amazon_products():
     """
     crawler_config = CrawlerRunConfig(
         cache_mode=CacheMode.BYPASS,
-        js_code = js_code_to_search,
+        js_code=js_code_to_search,
         wait_for='css:[data-component-type="s-search-result"]',
         extraction_strategy=JsonCssExtractionStrategy(
             schema={
@@ -41,75 +41,70 @@ async def extract_amazon_products():
 [... identical JsonCssExtractionStrategy schema to the first example above: the asin, title, url, image, rating, reviews_count, price, original_price, sponsored, and delivery_info fields receive the same trailing-comma and line-collapsing fixes ...]
             }
-        )
+        ),
     )
     # Example search URL (you should replace with your actual Amazon URL)
     url = "https://www.amazon.com/"

     # Use context manager for proper resource handling
     async with AsyncWebCrawler(config=browser_config) as crawler:
         # Extract the data
         result = await crawler.arun(url=url, config=crawler_config)

         # Process and print the results
         if result and result.extracted_content:
             # Parse the JSON string into a list of products
             products = json.loads(result.extracted_content)

             # Process each product in the list
             for product in products:
                 print("\nProduct Details:")
@@ -120,10 +115,12 @@ async def extract_amazon_products():
                 print(f"Rating: {product.get('rating')}")
                 print(f"Reviews: {product.get('reviews_count')}")
                 print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
-                if product.get('delivery_info'):
+                if product.get("delivery_info"):
                     print(f"Delivery: {' '.join(product['delivery_info'])}")
                 print("-" * 80)


 if __name__ == "__main__":
     import asyncio

     asyncio.run(extract_amazon_products())
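This third variant replaces the Playwright hook with injected JavaScript. A minimal sketch of that `js_code` / `wait_for` combination, assuming a placeholder one-line script rather than the full search routine from the diff:

```python
import asyncio

from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig


async def main():
    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        # JavaScript executed in the page before content is captured.
        js_code="document.querySelector('#twotabsearchtextbox').value = 'Samsung Galaxy Tab';",
        # Do not return until at least one search result is present.
        wait_for='css:[data-component-type="s-search-result"]',
    )
    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
        result = await crawler.arun(url="https://www.amazon.com/", config=config)
        print(result.markdown[:300])


asyncio.run(main())
```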
@@ -1,12 +1,16 @@
 # File: async_webcrawler_multiple_urls_example.py
 import os, sys

 # append 2 parent directories to sys.path to import crawl4ai
-parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+parent_dir = os.path.dirname(
+    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+)
 sys.path.append(parent_dir)

 import asyncio
 from crawl4ai import AsyncWebCrawler


 async def main():
     # Initialize the AsyncWebCrawler
     async with AsyncWebCrawler(verbose=True) as crawler:
@@ -16,7 +20,7 @@ async def main():
             "https://python.org",
             "https://github.com",
             "https://stackoverflow.com",
-            "https://news.ycombinator.com"
+            "https://news.ycombinator.com",
         ]

         # Set up crawling parameters
@@ -27,7 +31,7 @@ async def main():
             urls=urls,
             word_count_threshold=word_count_threshold,
             bypass_cache=True,
-            verbose=True
+            verbose=True,
         )

         # Process the results
@@ -36,7 +40,9 @@ async def main():
                 print(f"Successfully crawled: {result.url}")
                 print(f"Title: {result.metadata.get('title', 'N/A')}")
                 print(f"Word count: {len(result.markdown.split())}")
-                print(f"Number of links: {len(result.links.get('internal', [])) + len(result.links.get('external', []))}")
+                print(
+                    f"Number of links: {len(result.links.get('internal', [])) + len(result.links.get('external', []))}"
+                )
                 print(f"Number of images: {len(result.media.get('images', []))}")
                 print("---")
             else:
@@ -44,5 +50,6 @@ async def main():
                 print(f"Error: {result.error_message}")
                 print("---")


 if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(main())
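The file above crawls several URLs in one session and reports per-result statistics. A simpler per-URL loop that exercises the same result fields (`success`, `markdown`, `error_message`); this is a sketch, not the exact helper call the example uses:

```python
import asyncio

from crawl4ai import AsyncWebCrawler


async def main():
    urls = [
        "https://python.org",
        "https://github.com",
        "https://news.ycombinator.com",
    ]
    async with AsyncWebCrawler(verbose=True) as crawler:
        for url in urls:
            result = await crawler.arun(url=url, bypass_cache=True)
            if result.success:
                print(url, len(result.markdown.split()), "words")
            else:
                print(url, "failed:", result.error_message)


asyncio.run(main())
```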
@@ -6,10 +6,8 @@ This example demonstrates optimal browser usage patterns in Crawl4AI:
 """

 import asyncio
-import os
-from typing import List
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
 from crawl4ai.content_filter_strategy import PruningContentFilter
 from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
@@ -1,31 +1,32 @@
 import os, time

 # append the path to the root of the project
 import sys
 import asyncio
-sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
 from firecrawl import FirecrawlApp
 from crawl4ai import AsyncWebCrawler
-__data__ = os.path.join(os.path.dirname(__file__), '..', '..') + '/.data'
+
+__data__ = os.path.join(os.path.dirname(__file__), "..", "..") + "/.data"


 async def compare():
-    app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])
+    app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])

     # Tet Firecrawl with a simple crawl
     start = time.time()
     scrape_status = app.scrape_url(
-        'https://www.nbcnews.com/business',
-        params={'formats': ['markdown', 'html']}
+        "https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]}
     )
     end = time.time()
     print(f"Time taken: {end - start} seconds")
-    print(len(scrape_status['markdown']))
+    print(len(scrape_status["markdown"]))
     # save the markdown content with provider name
     with open(f"{__data__}/firecrawl_simple.md", "w") as f:
-        f.write(scrape_status['markdown'])
+        f.write(scrape_status["markdown"])
     # Count how many "cldnry.s-nbcnews.com" are in the markdown
-    print(scrape_status['markdown'].count("cldnry.s-nbcnews.com"))
+    print(scrape_status["markdown"].count("cldnry.s-nbcnews.com"))

     async with AsyncWebCrawler() as crawler:
         start = time.time()
@@ -33,13 +34,13 @@ async def compare():
             url="https://www.nbcnews.com/business",
             # js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"],
             word_count_threshold=0,
             bypass_cache=True,
-            verbose=False
+            verbose=False,
         )
         end = time.time()
         print(f"Time taken: {end - start} seconds")
         print(len(result.markdown))
         # save the markdown content with provider name
         with open(f"{__data__}/crawl4ai_simple.md", "w") as f:
             f.write(result.markdown)
         # count how many "cldnry.s-nbcnews.com" are in the markdown
@@ -48,10 +49,12 @@ async def compare():
         start = time.time()
         result = await crawler.arun(
             url="https://www.nbcnews.com/business",
-            js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"],
+            js_code=[
+                "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
+            ],
             word_count_threshold=0,
             bypass_cache=True,
-            verbose=False
+            verbose=False,
         )
         end = time.time()
         print(f"Time taken: {end - start} seconds")
@@ -61,7 +64,7 @@ async def compare():
         f.write(result.markdown)
         # count how many "cldnry.s-nbcnews.com" are in the markdown
         print(result.markdown.count("cldnry.s-nbcnews.com"))


 if __name__ == "__main__":
     asyncio.run(compare())
@@ -3,11 +3,18 @@ import time
 from rich import print
 from rich.table import Table
 from crawl4ai import (
-    AsyncWebCrawler, BrowserConfig, CrawlerRunConfig,
-    MemoryAdaptiveDispatcher, SemaphoreDispatcher,
-    RateLimiter, CrawlerMonitor, DisplayMode, CacheMode
+    AsyncWebCrawler,
+    BrowserConfig,
+    CrawlerRunConfig,
+    MemoryAdaptiveDispatcher,
+    SemaphoreDispatcher,
+    RateLimiter,
+    CrawlerMonitor,
+    DisplayMode,
+    CacheMode,
 )


 async def memory_adaptive(urls, browser_config, run_config):
     """Memory adaptive crawler with monitoring"""
     start = time.perf_counter()
@@ -16,14 +23,16 @@ async def memory_adaptive(urls, browser_config, run_config):
         memory_threshold_percent=70.0,
         max_session_permit=10,
         monitor=CrawlerMonitor(
-            max_visible_rows=15,
-            display_mode=DisplayMode.DETAILED
-        )
+            max_visible_rows=15, display_mode=DisplayMode.DETAILED
+        ),
     )
-    results = await crawler.arun_many(
-        urls, config=run_config, dispatcher=dispatcher
-    )
+    results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
     duration = time.perf_counter() - start
     return len(results), duration


 async def memory_adaptive_with_rate_limit(urls, browser_config, run_config):
     """Memory adaptive crawler with rate limiting"""
     start = time.perf_counter()
@@ -32,19 +41,19 @@ async def memory_adaptive_with_rate_limit(urls, browser_config, run_config):
         memory_threshold_percent=70.0,
         max_session_permit=10,
         rate_limiter=RateLimiter(
-            base_delay=(1.0, 2.0),
-            max_delay=30.0,
-            max_retries=2
+            base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2
         ),
         monitor=CrawlerMonitor(
-            max_visible_rows=15,
-            display_mode=DisplayMode.DETAILED
-        )
+            max_visible_rows=15, display_mode=DisplayMode.DETAILED
+        ),
     )
-    results = await crawler.arun_many(
-        urls, config=run_config, dispatcher=dispatcher
-    )
+    results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
     duration = time.perf_counter() - start
     return len(results), duration


 async def semaphore(urls, browser_config, run_config):
     """Basic semaphore crawler"""
     start = time.perf_counter()
@@ -52,14 +61,16 @@ async def semaphore(urls, browser_config, run_config):
 [... the semaphore and semaphore_with_rate_limit helpers (SemaphoreDispatcher with semaphore_count=5) get the same fixes as the memory-adaptive variants above: collapsed CrawlerMonitor and RateLimiter arguments, trailing commas, and the arun_many call joined onto one line ...]
@@ -67,19 +78,19 @@ async def semaphore_with_rate_limit(urls, browser_config, run_config):


 def create_performance_table(results):
     """Creates a rich table showing performance results"""
     table = Table(title="Crawler Strategy Performance Comparison")
@@ -89,18 +100,16 @@ def create_performance_table(results):
     table.add_column("URLs/second", justify="right", style="magenta")

     sorted_results = sorted(results.items(), key=lambda x: x[1][1])

     for strategy, (urls_crawled, duration) in sorted_results:
         urls_per_second = urls_crawled / duration
         table.add_row(
-            strategy,
-            str(urls_crawled),
-            f"{duration:.2f}",
-            f"{urls_per_second:.2f}"
+            strategy, str(urls_crawled), f"{duration:.2f}", f"{urls_per_second:.2f}"
         )

     return table


 async def main():
     urls = [f"https://example.com/page{i}" for i in range(1, 20)]
     browser_config = BrowserConfig(headless=True, verbose=False)
@@ -108,14 +117,19 @@ async def main():

     results = {
         "Memory Adaptive": await memory_adaptive(urls, browser_config, run_config),
-        "Memory Adaptive + Rate Limit": await memory_adaptive_with_rate_limit(urls, browser_config, run_config),
+        "Memory Adaptive + Rate Limit": await memory_adaptive_with_rate_limit(
+            urls, browser_config, run_config
+        ),
         "Semaphore": await semaphore(urls, browser_config, run_config),
-        "Semaphore + Rate Limit": await semaphore_with_rate_limit(urls, browser_config, run_config),
+        "Semaphore + Rate Limit": await semaphore_with_rate_limit(
+            urls, browser_config, run_config
+        ),
     }

     table = create_performance_table(results)
     print("\nPerformance Summary:")
     print(table)


 if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(main())
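To make the benchmarked configurations concrete, a small sketch of one of them — a MemoryAdaptiveDispatcher with a RateLimiter wired into arun_many — using the same constructor arguments that appear in the diff; the URL list is a placeholder:

```python
import asyncio

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    MemoryAdaptiveDispatcher,
    RateLimiter,
)


async def main():
    urls = [f"https://example.com/page{i}" for i in range(1, 20)]
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=70.0,  # back off when memory usage crosses this
        max_session_permit=10,          # upper bound on concurrent sessions
        rate_limiter=RateLimiter(base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2),
    )
    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
        results = await crawler.arun_many(
            urls, config=CrawlerRunConfig(), dispatcher=dispatcher
        )
        print(len(results), "pages crawled")


asyncio.run(main())
```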
@@ -6,63 +6,80 @@ import base64
 import os
 from typing import Dict, Any


 class Crawl4AiTester:
     def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None):
         self.base_url = base_url
-        self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code" # Check environment variable as fallback
-        self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {}
-
-    def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]:
+        self.api_token = (
+            api_token or os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code"
+        )  # Check environment variable as fallback
+        self.headers = (
+            {"Authorization": f"Bearer {self.api_token}"} if self.api_token else {}
+        )
+
+    def submit_and_wait(
+        self, request_data: Dict[str, Any], timeout: int = 300
+    ) -> Dict[str, Any]:
         # Submit crawl job
-        response = requests.post(f"{self.base_url}/crawl", json=request_data, headers=self.headers)
+        response = requests.post(
+            f"{self.base_url}/crawl", json=request_data, headers=self.headers
+        )
         if response.status_code == 403:
             raise Exception("API token is invalid or missing")
         task_id = response.json()["task_id"]
         print(f"Task ID: {task_id}")

         # Poll for result
         start_time = time.time()
         while True:
             if time.time() - start_time > timeout:
-                raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")
-
-            result = requests.get(f"{self.base_url}/task/{task_id}", headers=self.headers)
+                raise TimeoutError(
+                    f"Task {task_id} did not complete within {timeout} seconds"
+                )
+
+            result = requests.get(
+                f"{self.base_url}/task/{task_id}", headers=self.headers
+            )
             status = result.json()

             if status["status"] == "failed":
                 print("Task failed:", status.get("error"))
                 raise Exception(f"Task failed: {status.get('error')}")

             if status["status"] == "completed":
                 return status

             time.sleep(2)

     def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
-        response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, headers=self.headers, timeout=60)
+        response = requests.post(
+            f"{self.base_url}/crawl_sync",
+            json=request_data,
+            headers=self.headers,
+            timeout=60,
+        )
         if response.status_code == 408:
             raise TimeoutError("Task did not complete within server timeout")
         response.raise_for_status()
         return response.json()

     def crawl_direct(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
         """Directly crawl without using task queue"""
         response = requests.post(
-            f"{self.base_url}/crawl_direct",
-            json=request_data,
-            headers=self.headers
+            f"{self.base_url}/crawl_direct", json=request_data, headers=self.headers
         )
         response.raise_for_status()
         return response.json()


 def test_docker_deployment(version="basic"):
     tester = Crawl4AiTester(
-        base_url="http://localhost:11235" ,
+        base_url="http://localhost:11235",
         # base_url="https://api.crawl4ai.com" # just for example
         # api_token="test" # just for example
     )
     print(f"Testing Crawl4AI Docker {version} version")

     # Health check with timeout and retry
     max_retries = 5
     for i in range(max_retries):
@@ -70,19 +87,19 @@ def test_docker_deployment(version="basic"):
         try:
             health = requests.get(f"{tester.base_url}/health", timeout=10)
             print("Health check:", health.json())
             break
-        except requests.exceptions.RequestException as e:
+        except requests.exceptions.RequestException:
             if i == max_retries - 1:
                 print(f"Failed to connect after {max_retries} attempts")
                 sys.exit(1)
             print(f"Waiting for service to start (attempt {i+1}/{max_retries})...")
             time.sleep(5)

     # Test cases based on version
     test_basic_crawl_direct(tester)
     test_basic_crawl(tester)
     test_basic_crawl_sync(tester)

     if version in ["full", "transformer"]:
         test_cosine_extraction(tester)
@@ -92,49 +109,52 @@ def test_docker_deployment(version="basic"):
     test_llm_extraction(tester)
     test_llm_with_ollama(tester)
     test_screenshot(tester)


 def test_basic_crawl(tester: Crawl4AiTester):
     print("\n=== Testing Basic Crawl ===")
     request = {
         "urls": "https://www.nbcnews.com/business",
         "priority": 10,
-        "session_id": "test"
+        "session_id": "test",
     }

     result = tester.submit_and_wait(request)
     print(f"Basic crawl result length: {len(result['result']['markdown'])}")
     assert result["result"]["success"]
     assert len(result["result"]["markdown"]) > 0

 [... test_basic_crawl_sync and test_basic_crawl_direct get the same treatment: trailing commas in the request dicts, the "cache_mode": "bypass" comment preserved, and assertions such as assert result['status'] == 'completed' switched to double quotes ...]

 def test_js_execution(tester: Crawl4AiTester):
     print("\n=== Testing JS Execution ===")
     request = {
@@ -144,32 +164,29 @@ def test_js_execution(tester: Crawl4AiTester):
             "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
         ],
         "wait_for": "article.tease-card:nth-child(10)",
-        "crawler_params": {
-            "headless": True
-        }
+        "crawler_params": {"headless": True},
     }

     result = tester.submit_and_wait(request)
     print(f"JS execution result length: {len(result['result']['markdown'])}")
     assert result["result"]["success"]


 def test_css_selector(tester: Crawl4AiTester):
     print("\n=== Testing CSS Selector ===")
     request = {
         "urls": "https://www.nbcnews.com/business",
         "priority": 7,
         "css_selector": ".wide-tease-item__description",
-        "crawler_params": {
-            "headless": True
-        },
-        "extra": {"word_count_threshold": 10}
+        "crawler_params": {"headless": True},
+        "extra": {"word_count_threshold": 10},
     }

     result = tester.submit_and_wait(request)
     print(f"CSS selector result length: {len(result['result']['markdown'])}")
     assert result["result"]["success"]


 def test_structured_extraction(tester: Crawl4AiTester):
     print("\n=== Testing Structured Extraction ===")
     schema = {
@@ -190,21 +207,16 @@ def test_structured_extraction(tester: Crawl4AiTester):
                 "name": "price",
                 "selector": "td:nth-child(2)",
                 "type": "text",
-            }
+            },
         ],
     }

     request = {
         "urls": "https://www.coinbase.com/explore",
         "priority": 9,
-        "extraction_config": {
-            "type": "json_css",
-            "params": {
-                "schema": schema
-            }
-        }
+        "extraction_config": {"type": "json_css", "params": {"schema": schema}},
     }

     result = tester.submit_and_wait(request)
     extracted = json.loads(result["result"]["extracted_content"])
     print(f"Extracted {len(extracted)} items")
@@ -212,6 +224,7 @@ def test_structured_extraction(tester: Crawl4AiTester):
     assert result["result"]["success"]
     assert len(extracted) > 0


 def test_llm_extraction(tester: Crawl4AiTester):
     print("\n=== Testing LLM Extraction ===")
     schema = {
@@ -219,20 +232,20 @@ def test_llm_extraction(tester: Crawl4AiTester):
         "properties": {
             "model_name": {
                 "type": "string",
-                "description": "Name of the OpenAI model."
+                "description": "Name of the OpenAI model.",
             },
             "input_fee": {
                 "type": "string",
-                "description": "Fee for input token for the OpenAI model."
+                "description": "Fee for input token for the OpenAI model.",
             },
             "output_fee": {
                 "type": "string",
-                "description": "Fee for output token for the OpenAI model."
-            }
+                "description": "Fee for output token for the OpenAI model.",
+            },
         },
-        "required": ["model_name", "input_fee", "output_fee"]
+        "required": ["model_name", "input_fee", "output_fee"],
     }

     request = {
         "urls": "https://openai.com/api/pricing",
         "priority": 8,
@@ -243,12 +256,12 @@ def test_llm_extraction(tester: Crawl4AiTester):
                 "api_token": os.getenv("OPENAI_API_KEY"),
                 "schema": schema,
                 "extraction_type": "schema",
-                "instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens."""
-            }
+                "instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens.""",
+            },
         },
-        "crawler_params": {"word_count_threshold": 1}
+        "crawler_params": {"word_count_threshold": 1},
     }

     try:
         result = tester.submit_and_wait(request)
         extracted = json.loads(result["result"]["extracted_content"])
@@ -258,6 +271,7 @@ def test_llm_extraction(tester: Crawl4AiTester):
     except Exception as e:
         print(f"LLM extraction test failed (might be due to missing API key): {str(e)}")


 def test_llm_with_ollama(tester: Crawl4AiTester):
@@ -265,20 +279,20 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
 [... the Ollama (provider "ollama/llama2") and cosine-extraction tests get the same mechanical fixes: trailing commas after each schema "description" entry, after "instruction" strings, after "top_k": 3, and after the closing braces, plus "crawler_params": {"verbose": True} kept on one line ...]
@@ -328,30 +343,30 @@ def test_cosine_extraction(tester: Crawl4AiTester):
 def test_screenshot(tester: Crawl4AiTester):
     print("\n=== Testing Screenshot ===")
     request = {
         "urls": "https://www.nbcnews.com/business",
         "priority": 5,
         "screenshot": True,
-        "crawler_params": {
-            "headless": True
-        }
+        "crawler_params": {"headless": True},
     }

     result = tester.submit_and_wait(request)
     print("Screenshot captured:", bool(result["result"]["screenshot"]))

     if result["result"]["screenshot"]:
         # Save screenshot
         screenshot_data = base64.b64decode(result["result"]["screenshot"])
         with open("test_screenshot.jpg", "wb") as f:
             f.write(screenshot_data)
         print("Screenshot saved as test_screenshot.jpg")

     assert result["result"]["success"]


 if __name__ == "__main__":
     version = sys.argv[1] if len(sys.argv) > 1 else "basic"
     # version = "full"
-    test_docker_deployment(version)
+    test_docker_deployment(version)
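The tester class above wraps a small REST workflow against the Docker deployment. A stripped-down sketch of the same submit-then-poll loop with plain `requests`, assuming only the /crawl and /task/{id} endpoints and the bearer-token header shown in the diff:

```python
import os
import time

import requests

BASE_URL = "http://localhost:11235"
TOKEN = os.getenv("CRAWL4AI_API_TOKEN", "test_api_code")
HEADERS = {"Authorization": f"Bearer {TOKEN}"}


def submit_and_wait(request_data: dict, timeout: int = 300) -> dict:
    # Submit the crawl job and remember its task id.
    response = requests.post(f"{BASE_URL}/crawl", json=request_data, headers=HEADERS)
    response.raise_for_status()
    task_id = response.json()["task_id"]

    # Poll the task endpoint until it completes or the timeout expires.
    deadline = time.time() + timeout
    while time.time() < deadline:
        status = requests.get(f"{BASE_URL}/task/{task_id}", headers=HEADERS).json()
        if status["status"] == "completed":
            return status
        if status["status"] == "failed":
            raise RuntimeError(status.get("error"))
        time.sleep(2)
    raise TimeoutError(f"Task {task_id} did not finish in {timeout}s")


result = submit_and_wait({"urls": "https://www.nbcnews.com/business", "priority": 10})
print(len(result["result"]["markdown"]))
```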
@@ -9,18 +9,17 @@ This example shows how to:

 import asyncio
 import os
 from typing import Dict, Any

 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
 from crawl4ai.extraction_strategy import (
     LLMExtractionStrategy,
     JsonCssExtractionStrategy,
-    JsonXPathExtractionStrategy
+    JsonXPathExtractionStrategy,
 )
 from crawl4ai.chunking_strategy import RegexChunking, IdentityChunking
 from crawl4ai.content_filter_strategy import PruningContentFilter
 from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator


 async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str):
     """Helper function to run extraction with proper configuration"""
     try:
@@ -30,78 +29,90 @@ async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str
             extraction_strategy=strategy,
             markdown_generator=DefaultMarkdownGenerator(
                 content_filter=PruningContentFilter()  # For fit_markdown support
-            )
+            ),
         )

         # Run the crawler
         result = await crawler.arun(url=url, config=config)

         if result.success:
             print(f"\n=== {name} Results ===")
             print(f"Extracted Content: {result.extracted_content}")
             print(f"Raw Markdown Length: {len(result.markdown_v2.raw_markdown)}")
-            print(f"Citations Markdown Length: {len(result.markdown_v2.markdown_with_citations)}")
+            print(
+                f"Citations Markdown Length: {len(result.markdown_v2.markdown_with_citations)}"
+            )
         else:
             print(f"Error in {name}: Crawl failed")

     except Exception as e:
         print(f"Error in {name}: {str(e)}")


 async def main():
     # Example URL (replace with actual URL)
     url = "https://example.com/product-page"

     # Configure browser settings
-    browser_config = BrowserConfig(
-        headless=True,
-        verbose=True
-    )
+    browser_config = BrowserConfig(headless=True, verbose=True)

     # Initialize extraction strategies

     # 1. LLM Extraction with different input formats
     markdown_strategy = LLMExtractionStrategy(
         provider="openai/gpt-4o-mini",
         api_token=os.getenv("OPENAI_API_KEY"),
-        instruction="Extract product information including name, price, and description"
+        instruction="Extract product information including name, price, and description",
     )

 [... the html_strategy (input_format="html") and fit_markdown_strategy (input_format="fit_markdown") definitions get the same trailing comma after their instruction argument ...]

     # 2. JSON CSS Extraction (automatically uses HTML input)
     css_schema = {
         "baseSelector": ".product",
         "fields": [
             {"name": "title", "selector": "h1.product-title", "type": "text"},
             {"name": "price", "selector": ".price", "type": "text"},
-            {"name": "description", "selector": ".description", "type": "text"}
-        ]
+            {"name": "description", "selector": ".description", "type": "text"},
+        ],
     }
     css_strategy = JsonCssExtractionStrategy(schema=css_schema)

     # 3. JSON XPath Extraction (automatically uses HTML input)
     xpath_schema = {
         "baseSelector": "//div[@class='product']",
         "fields": [
-            {"name": "title", "selector": ".//h1[@class='product-title']/text()", "type": "text"},
-            {"name": "price", "selector": ".//span[@class='price']/text()", "type": "text"},
-            {"name": "description", "selector": ".//div[@class='description']/text()", "type": "text"}
-        ]
+            {
+                "name": "title",
+                "selector": ".//h1[@class='product-title']/text()",
+                "type": "text",
+            },
+            {
+                "name": "price",
+                "selector": ".//span[@class='price']/text()",
+                "type": "text",
+            },
+            {
+                "name": "description",
+                "selector": ".//div[@class='description']/text()",
+                "type": "text",
+            },
+        ],
     }
     xpath_strategy = JsonXPathExtractionStrategy(schema=xpath_schema)

     # Use context manager for proper resource handling
     async with AsyncWebCrawler(config=browser_config) as crawler:
         # Run all strategies
@@ -111,5 +122,6 @@ async def main():
         await run_extraction(crawler, url, css_strategy, "CSS Extraction")
         await run_extraction(crawler, url, xpath_strategy, "XPath Extraction")


 if __name__ == "__main__":
     asyncio.run(main())
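For orientation, a condensed restatement of the three LLM strategies this example builds, with the role of `input_format` called out in comments; the provider name and environment variable are the ones that appear in the diff, and the instructions are illustrative:

```python
import os

from crawl4ai.extraction_strategy import LLMExtractionStrategy

# Same extraction task, three different views of the page handed to the model.
md_strategy = LLMExtractionStrategy(
    provider="openai/gpt-4o-mini",
    api_token=os.getenv("OPENAI_API_KEY"),
    instruction="Extract product information including name, price, and description",
)
html_strategy = LLMExtractionStrategy(
    input_format="html",  # raw page HTML, useful when the data lives in markup
    provider="openai/gpt-4o-mini",
    api_token=os.getenv("OPENAI_API_KEY"),
    instruction="Extract product information from HTML including structured data",
)
fit_md_strategy = LLMExtractionStrategy(
    input_format="fit_markdown",  # pruned markdown produced by the content filter
    provider="openai/gpt-4o-mini",
    api_token=os.getenv("OPENAI_API_KEY"),
    instruction="Extract product information from cleaned markdown",
)
```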
@@ -1,20 +1,23 @@
 import asyncio
 from crawl4ai import *


 async def main():
     browser_config = BrowserConfig(headless=True, verbose=True)
     async with AsyncWebCrawler(config=browser_config) as crawler:
         crawler_config = CrawlerRunConfig(
             cache_mode=CacheMode.BYPASS,
             markdown_generator=DefaultMarkdownGenerator(
-                content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
-            )
+                content_filter=PruningContentFilter(
+                    threshold=0.48, threshold_type="fixed", min_word_threshold=0
+                )
+            ),
         )
         result = await crawler.arun(
-            url="https://www.helloworld.org",
-            config=crawler_config
+            url="https://www.helloworld.org", config=crawler_config
         )
         print(result.markdown_v2.raw_markdown[:500])


 if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(main())
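A minimal sketch of the markdown-generator pattern this file exercises — PruningContentFilter feeding DefaultMarkdownGenerator so that `result.markdown_v2` carries both raw and filtered markdown. The threshold values are taken from the diff; explicit submodule imports are used instead of the file's wildcard import:

```python
import asyncio

from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator


async def main():
    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=DefaultMarkdownGenerator(
            # Drop low-value blocks before markdown generation.
            content_filter=PruningContentFilter(
                threshold=0.48, threshold_type="fixed", min_word_threshold=0
            )
        ),
    )
    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
        result = await crawler.arun(url="https://www.helloworld.org", config=config)
        print(result.markdown_v2.raw_markdown[:500])


asyncio.run(main())
```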
@@ -1,19 +1,18 @@
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
 from playwright.async_api import Page, BrowserContext


 async def main():
     print("🔗 Hooks Example: Demonstrating different hook use cases")

     # Configure browser settings
-    browser_config = BrowserConfig(
-        headless=True
-    )
+    browser_config = BrowserConfig(headless=True)

     # Configure crawler settings
     crawler_run_config = CrawlerRunConfig(
         js_code="window.scrollTo(0, document.body.scrollHeight);",
         wait_for="body",
-        cache_mode=CacheMode.BYPASS
+        cache_mode=CacheMode.BYPASS,
     )

     # Create crawler instance
@@ -30,16 +29,22 @@ async def main():
         """Hook called after a new page and context are created"""
         print("[HOOK] on_page_context_created - New page created!")
         # Example: Set default viewport size
-        await context.add_cookies([{
-            'name': 'session_id',
-            'value': 'example_session',
-            'domain': '.example.com',
-            'path': '/'
-        }])
+        await context.add_cookies(
+            [
+                {
+                    "name": "session_id",
+                    "value": "example_session",
+                    "domain": ".example.com",
+                    "path": "/",
+                }
+            ]
+        )
         await page.set_viewport_size({"width": 1080, "height": 800})
         return page

-    async def on_user_agent_updated(page: Page, context: BrowserContext, user_agent: str, **kwargs):
+    async def on_user_agent_updated(
+        page: Page, context: BrowserContext, user_agent: str, **kwargs
+    ):
         """Hook called when the user agent is updated"""
         print(f"[HOOK] on_user_agent_updated - New user agent: {user_agent}")
         return page
@@ -53,17 +58,17 @@ async def main():
         """Hook called before navigating to each URL"""
         print(f"[HOOK] before_goto - About to visit: {url}")
         # Example: Add custom headers for the request
-        await page.set_extra_http_headers({
-            "Custom-Header": "my-value"
-        })
+        await page.set_extra_http_headers({"Custom-Header": "my-value"})
         return page

-    async def after_goto(page: Page, context: BrowserContext, url: str, response: dict, **kwargs):
+    async def after_goto(
+        page: Page, context: BrowserContext, url: str, response: dict, **kwargs
+    ):
         """Hook called after navigating to each URL"""
         print(f"[HOOK] after_goto - Successfully loaded: {url}")
         # Example: Wait for a specific element to be loaded
         try:
-            await page.wait_for_selector('.content', timeout=1000)
+            await page.wait_for_selector(".content", timeout=1000)
             print("Content element found!")
         except:
             print("Content element not found, continuing anyway")
@@ -76,7 +81,9 @@ async def main():
         await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
         return page

-    async def before_return_html(page: Page, context: BrowserContext, html:str, **kwargs):
+    async def before_return_html(
+        page: Page, context: BrowserContext, html: str, **kwargs
+    ):
         """Hook called before returning the HTML content"""
         print(f"[HOOK] before_return_html - Got HTML content (length: {len(html)})")
         # Example: You could modify the HTML content here if needed
@@ -84,7 +91,9 @@ async def main():

     # Set all the hooks
     crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created)
-    crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)
+    crawler.crawler_strategy.set_hook(
+        "on_page_context_created", on_page_context_created
+    )
     crawler.crawler_strategy.set_hook("on_user_agent_updated", on_user_agent_updated)
     crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
     crawler.crawler_strategy.set_hook("before_goto", before_goto)
@@ -95,13 +104,15 @@ async def main():
     await crawler.start()

     # Example usage: crawl a simple website
-    url = 'https://example.com'
+    url = "https://example.com"
     result = await crawler.arun(url, config=crawler_run_config)
     print(f"\nCrawled URL: {result.url}")
     print(f"HTML length: {len(result.html)}")

     await crawler.close()


 if __name__ == "__main__":
     import asyncio
-    asyncio.run(main())
+
+    asyncio.run(main())
@@ -1,6 +1,7 @@
 import asyncio
 from crawl4ai import AsyncWebCrawler, AsyncPlaywrightCrawlerStrategy


 async def main():
     # Example 1: Setting language when creating the crawler
     crawler1 = AsyncWebCrawler(
@@ -9,11 +10,15 @@ async def main():
         )
     )
     result1 = await crawler1.arun("https://www.example.com")
-    print("Example 1 result:", result1.extracted_content[:100])  # Print first 100 characters
+    print(
+        "Example 1 result:", result1.extracted_content[:100]
+    )  # Print first 100 characters

     # Example 2: Setting language before crawling
     crawler2 = AsyncWebCrawler()
-    crawler2.crawler_strategy.headers["Accept-Language"] = "es-ES,es;q=0.9,en-US;q=0.8,en;q=0.7"
+    crawler2.crawler_strategy.headers[
+        "Accept-Language"
+    ] = "es-ES,es;q=0.9,en-US;q=0.8,en;q=0.7"
     result2 = await crawler2.arun("https://www.example.com")
     print("Example 2 result:", result2.extracted_content[:100])

@@ -21,7 +26,7 @@ async def main():
     crawler3 = AsyncWebCrawler()
     result3 = await crawler3.arun(
         "https://www.example.com",
-        headers={"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7"}
+        headers={"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7"},
     )
     print("Example 3 result:", result3.extracted_content[:100])

@@ -31,15 +36,15 @@ async def main():
         ("https://www.example.org", "es-ES,es;q=0.9"),
         ("https://www.example.net", "de-DE,de;q=0.9"),
     ]

     crawler4 = AsyncWebCrawler()
-    results = await asyncio.gather(*[
-        crawler4.arun(url, headers={"Accept-Language": lang})
-        for url, lang in urls
-    ])
+    results = await asyncio.gather(
+        *[crawler4.arun(url, headers={"Accept-Language": lang}) for url, lang in urls]
+    )

     for url, result in zip([u for u, _ in urls], results):
         print(f"Result for {url}:", result.extracted_content[:100])


 if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(main())
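A compact restatement of the per-request override pattern from this example — passing `headers` directly to `arun` so the Accept-Language applies to one request only; the header value is the one in the diff, the rest is illustrative:

```python
import asyncio

from crawl4ai import AsyncWebCrawler


async def main():
    async with AsyncWebCrawler() as crawler:
        # This Accept-Language is used for this request only; others keep the defaults.
        result = await crawler.arun(
            "https://www.example.com",
            headers={"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7"},
        )
        print(result.markdown[:100])


asyncio.run(main())
```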
@@ -3,32 +3,37 @@ from crawl4ai.crawler_strategy import *
 import asyncio
 from pydantic import BaseModel, Field

-url = r'https://openai.com/api/pricing/'
+url = r"https://openai.com/api/pricing/"


 class OpenAIModelFee(BaseModel):
     model_name: str = Field(..., description="Name of the OpenAI model.")
     input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
-    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
+    output_fee: str = Field(
+        ..., description="Fee for output token for the OpenAI model."
+    )


 from crawl4ai import AsyncWebCrawler


 async def main():
     # Use AsyncWebCrawler
     async with AsyncWebCrawler() as crawler:
         result = await crawler.arun(
             url=url,
             word_count_threshold=1,
-            extraction_strategy= LLMExtractionStrategy(
+            extraction_strategy=LLMExtractionStrategy(
                 # provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
-                provider= "groq/llama-3.1-70b-versatile", api_token = os.getenv('GROQ_API_KEY'),
+                provider="groq/llama-3.1-70b-versatile",
+                api_token=os.getenv("GROQ_API_KEY"),
                 schema=OpenAIModelFee.model_json_schema(),
                 extraction_type="schema",
-                instruction="From the crawled content, extract all mentioned model names along with their " \
-                "fees for input and output tokens. Make sure not to miss anything in the entire content. " \
-                'One extracted model JSON format should look like this: ' \
-                '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
+                instruction="From the crawled content, extract all mentioned model names along with their "
+                "fees for input and output tokens. Make sure not to miss anything in the entire content. "
+                "One extracted model JSON format should look like this: "
+                '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }',
             ),
         )
         print("Success:", result.success)
         model_fees = json.loads(result.extracted_content)
@@ -37,4 +42,5 @@ async def main():
     with open(".data/data.json", "w", encoding="utf-8") as f:
         f.write(result.extracted_content)


 asyncio.run(main())
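To show the schema-driven flow in one piece, a hedged sketch of the Pydantic-model-to-LLM-extraction pattern the file above uses; the model fields, provider, and GROQ_API_KEY variable come from the diff, while the trimmed instruction text is illustrative:

```python
import json
import os

from pydantic import BaseModel, Field

from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy


class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")


async def extract():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://openai.com/api/pricing/",
            word_count_threshold=1,
            extraction_strategy=LLMExtractionStrategy(
                provider="groq/llama-3.1-70b-versatile",
                api_token=os.getenv("GROQ_API_KEY"),
                # The Pydantic model's JSON schema tells the LLM exactly what shape to return.
                schema=OpenAIModelFee.model_json_schema(),
                extraction_type="schema",
                instruction="Extract every model name with its input and output token fees.",
            ),
        )
        return json.loads(result.extracted_content)
```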
@@ -8,12 +8,12 @@ import asyncio
 import time
 import json
 import re
-from typing import Dict, List
+from typing import Dict
 from bs4 import BeautifulSoup
 from pydantic import BaseModel, Field
 from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
 from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
-from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
+from crawl4ai.content_filter_strategy import PruningContentFilter
 from crawl4ai.extraction_strategy import (
     JsonCssExtractionStrategy,
     LLMExtractionStrategy,
@@ -62,6 +62,7 @@ async def clean_content():
     print(f"Full Markdown Length: {full_markdown_length}")
     print(f"Fit Markdown Length: {fit_markdown_length}")


 async def link_analysis():
     crawler_config = CrawlerRunConfig(
         cache_mode=CacheMode.ENABLED,
@@ -76,9 +77,10 @@ async def link_analysis():
         print(f"Found {len(result.links['internal'])} internal links")
         print(f"Found {len(result.links['external'])} external links")

-        for link in result.links['internal'][:5]:
+        for link in result.links["internal"][:5]:
             print(f"Href: {link['href']}\nText: {link['text']}\n")


 # JavaScript Execution Example
 async def simple_example_with_running_js_code():
     print("\n--- Executing JavaScript and Using CSS Selectors ---")
@@ -112,25 +114,29 @@ async def simple_example_with_css_selector():
     )
     print(result.markdown[:500])


 async def media_handling():
-    crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, exclude_external_images=True, screenshot=True)
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS, exclude_external_images=True, screenshot=True
+    )
     async with AsyncWebCrawler() as crawler:
         result = await crawler.arun(
-            url="https://www.nbcnews.com/business",
-            config=crawler_config
+            url="https://www.nbcnews.com/business", config=crawler_config
         )
-        for img in result.media['images'][:5]:
+        for img in result.media["images"][:5]:
             print(f"Image URL: {img['src']}, Alt: {img['alt']}, Score: {img['score']}")


 async def custom_hook_workflow(verbose=True):
     async with AsyncWebCrawler() as crawler:
         # Set a 'before_goto' hook to run custom code just before navigation
-        crawler.crawler_strategy.set_hook("before_goto", lambda page, context: print("[Hook] Preparing to navigate..."))
+        crawler.crawler_strategy.set_hook(
+            "before_goto",
+            lambda page, context: print("[Hook] Preparing to navigate..."),
+        )

         # Perform the crawl operation
-        result = await crawler.arun(
-            url="https://crawl4ai.com"
-        )
+        result = await crawler.arun(url="https://crawl4ai.com")
         print(result.markdown_v2.raw_markdown[:500].replace("\n", " -- "))
@@ -412,21 +418,22 @@ async def cosine_similarity_extraction():
cache_mode=CacheMode.BYPASS,
extraction_strategy=CosineStrategy(
word_count_threshold=10,
max_dist=0.2, # Maximum distance between two words
linkage_method="ward", # Linkage method for hierarchical clustering (ward, complete, average, single)
top_k=3, # Number of top keywords to extract
sim_threshold=0.3, # Similarity threshold for clustering
semantic_filter="McDonald's economic impact, American consumer trends", # Keywords to filter the content semantically using embeddings
verbose=True
),
max_dist=0.2, # Maximum distance between two words
linkage_method="ward", # Linkage method for hierarchical clustering (ward, complete, average, single)
top_k=3, # Number of top keywords to extract
sim_threshold=0.3, # Similarity threshold for clustering
semantic_filter="McDonald's economic impact, American consumer trends", # Keywords to filter the content semantically using embeddings
verbose=True,
),
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156",
config=crawl_config
config=crawl_config,
)
print(json.loads(result.extracted_content)[:5])


# Browser Comparison
async def crawl_custom_browser_type():
print("\n--- Browser Comparison ---")
@@ -484,39 +491,42 @@ async def crawl_with_user_simulation():
result = await crawler.arun(url="YOUR-URL-HERE", config=crawler_config)
print(result.markdown)


async def ssl_certification():
# Configure crawler to fetch SSL certificate
config = CrawlerRunConfig(
fetch_ssl_certificate=True,
cache_mode=CacheMode.BYPASS # Bypass cache to always get fresh certificates
cache_mode=CacheMode.BYPASS, # Bypass cache to always get fresh certificates
)

async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url='https://example.com',
config=config
)

result = await crawler.arun(url="https://example.com", config=config)

if result.success and result.ssl_certificate:
cert = result.ssl_certificate


# 1. Access certificate properties directly
print("\nCertificate Information:")
print(f"Issuer: {cert.issuer.get('CN', '')}")
print(f"Valid until: {cert.valid_until}")
print(f"Fingerprint: {cert.fingerprint}")


# 2. Export certificate in different formats
cert.to_json(os.path.join(tmp_dir, "certificate.json")) # For analysis
print("\nCertificate exported to:")
print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}")

pem_data = cert.to_pem(os.path.join(tmp_dir, "certificate.pem")) # For web servers

pem_data = cert.to_pem(
os.path.join(tmp_dir, "certificate.pem")
) # For web servers
print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}")

der_data = cert.to_der(os.path.join(tmp_dir, "certificate.der")) # For Java apps

der_data = cert.to_der(
os.path.join(tmp_dir, "certificate.der")
) # For Java apps
print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")


# Speed Comparison
async def speed_comparison():
print("\n--- Speed Comparison ---")

@@ -1,6 +1,10 @@
import os, sys

# append parent directory to system path
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))); os.environ['FIRECRAWL_API_KEY'] = "fc-84b370ccfad44beabc686b38f1769692";
sys.path.append(
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
)
os.environ["FIRECRAWL_API_KEY"] = "fc-84b370ccfad44beabc686b38f1769692"

import asyncio
# import nest_asyncio
@@ -15,7 +19,7 @@ from bs4 import BeautifulSoup
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.extraction_strategy import (
JsonCssExtractionStrategy,
LLMExtractionStrategy,
@@ -32,9 +36,12 @@ print("Website: https://crawl4ai.com")
async def simple_crawl():
print("\n--- Basic Usage ---")
async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(url="https://www.nbcnews.com/business", cache_mode= CacheMode.BYPASS)
result = await crawler.arun(
url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS
)
print(result.markdown[:500]) # Print first 500 characters


async def simple_example_with_running_js_code():
print("\n--- Executing JavaScript and Using CSS Selectors ---")
# New code to handle the wait_for parameter
@@ -57,6 +64,7 @@ async def simple_example_with_running_js_code():
)
print(result.markdown[:500]) # Print first 500 characters


async def simple_example_with_css_selector():
print("\n--- Using CSS Selectors ---")
async with AsyncWebCrawler(verbose=True) as crawler:
@@ -67,42 +75,44 @@ async def simple_example_with_css_selector():
)
print(result.markdown[:500]) # Print first 500 characters


async def use_proxy():
print("\n--- Using a Proxy ---")
print(
"Note: Replace 'http://your-proxy-url:port' with a working proxy to run this example."
)
# Uncomment and modify the following lines to use a proxy
async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler:
async with AsyncWebCrawler(
verbose=True, proxy="http://your-proxy-url:port"
) as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business",
cache_mode= CacheMode.BYPASS
url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS
)
if result.success:
print(result.markdown[:500]) # Print first 500 characters


async def capture_and_save_screenshot(url: str, output_path: str):
async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(
url=url,
screenshot=True,
cache_mode= CacheMode.BYPASS
url=url, screenshot=True, cache_mode=CacheMode.BYPASS
)


if result.success and result.screenshot:
import base64


# Decode the base64 screenshot data
screenshot_data = base64.b64decode(result.screenshot)


# Save the screenshot as a JPEG file
with open(output_path, 'wb') as f:
with open(output_path, "wb") as f:
f.write(screenshot_data)


print(f"Screenshot saved successfully to {output_path}")
else:
print("Failed to capture screenshot")


class OpenAIModelFee(BaseModel):
model_name: str = Field(..., description="Name of the OpenAI model.")
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
@@ -110,16 +120,19 @@ class OpenAIModelFee(BaseModel):
..., description="Fee for output token for the OpenAI model."
)

async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None):

async def extract_structured_data_using_llm(
provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
):
print(f"\n--- Extracting Structured Data with {provider} ---")


if api_token is None and provider != "ollama":
print(f"API token is required for {provider}. Skipping this example.")
return

# extra_args = {}
extra_args={
"temperature": 0,
extra_args = {
"temperature": 0,
"top_p": 0.9,
"max_tokens": 2000,
# any other supported parameters for litellm
@@ -139,52 +152,49 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
Do not miss any models in the entire content. One extracted model JSON format should look like this:
{"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""",
extra_args=extra_args
extra_args=extra_args,
),
cache_mode=CacheMode.BYPASS,
)
print(result.extracted_content)


async def extract_structured_data_using_css_extractor():
print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
schema = {
"name": "KidoCode Courses",
"baseSelector": "section.charge-methodology .w-tab-content > div",
"fields": [
{
"name": "section_title",
"selector": "h3.heading-50",
"type": "text",
},
{
"name": "section_description",
"selector": ".charge-content",
"type": "text",
},
{
"name": "course_name",
"selector": ".text-block-93",
"type": "text",
},
{
"name": "course_description",
"selector": ".course-content-text",
"type": "text",
},
{
"name": "course_icon",
"selector": ".image-92",
"type": "attribute",
"attribute": "src"
}
]
}
"name": "KidoCode Courses",
"baseSelector": "section.charge-methodology .w-tab-content > div",
"fields": [
{
"name": "section_title",
"selector": "h3.heading-50",
"type": "text",
},
{
"name": "section_description",
"selector": ".charge-content",
"type": "text",
},
{
"name": "course_name",
"selector": ".text-block-93",
"type": "text",
},
{
"name": "course_description",
"selector": ".course-content-text",
"type": "text",
},
{
"name": "course_icon",
"selector": ".image-92",
"type": "attribute",
"attribute": "src",
},
],
}

async with AsyncWebCrawler(
headless=True,
verbose=True
) as crawler:

async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
# Create the JavaScript that handles clicking multiple times
js_click_tabs = """
(async () => {
@@ -198,19 +208,20 @@ async def extract_structured_data_using_css_extractor():
await new Promise(r => setTimeout(r, 500));
}
})();
"""
"""

result = await crawler.arun(
url="https://www.kidocode.com/degrees/technology",
extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True),
js_code=[js_click_tabs],
cache_mode=CacheMode.BYPASS
cache_mode=CacheMode.BYPASS,
)

companies = json.loads(result.extracted_content)
print(f"Successfully extracted {len(companies)} companies")
print(json.dumps(companies[0], indent=2))


# Advanced Session-Based Crawling with Dynamic Content 🔄
async def crawl_dynamic_content_pages_method_1():
print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
@@ -267,6 +278,7 @@ async def crawl_dynamic_content_pages_method_1():
await crawler.crawler_strategy.kill_session(session_id)
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")


async def crawl_dynamic_content_pages_method_2():
print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")

@@ -334,8 +346,11 @@ async def crawl_dynamic_content_pages_method_2():
await crawler.crawler_strategy.kill_session(session_id)
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")


async def crawl_dynamic_content_pages_method_3():
print("\n--- Advanced Multi-Page Crawling with JavaScript Execution using `wait_for` ---")
print(
"\n--- Advanced Multi-Page Crawling with JavaScript Execution using `wait_for` ---"
)

async with AsyncWebCrawler(verbose=True) as crawler:
url = "https://github.com/microsoft/TypeScript/commits/main"
@@ -357,7 +372,7 @@ async def crawl_dynamic_content_pages_method_3():
const firstCommit = commits[0].textContent.trim();
return firstCommit !== window.firstCommit;
}"""


schema = {
"name": "Commit Extractor",
"baseSelector": "li.Box-sc-g0xbh4-0",
@@ -395,40 +410,53 @@ async def crawl_dynamic_content_pages_method_3():
await crawler.crawler_strategy.kill_session(session_id)
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")


async def crawl_custom_browser_type():
# Use Firefox
start = time.time()
async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless = True) as crawler:
result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
async with AsyncWebCrawler(
browser_type="firefox", verbose=True, headless=True
) as crawler:
result = await crawler.arun(
url="https://www.example.com", cache_mode=CacheMode.BYPASS
)
print(result.markdown[:500])
print("Time taken: ", time.time() - start)

# Use WebKit
start = time.time()
async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless = True) as crawler:
result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
async with AsyncWebCrawler(
browser_type="webkit", verbose=True, headless=True
) as crawler:
result = await crawler.arun(
url="https://www.example.com", cache_mode=CacheMode.BYPASS
)
print(result.markdown[:500])
print("Time taken: ", time.time() - start)

# Use Chromium (default)
start = time.time()
async with AsyncWebCrawler(verbose=True, headless = True) as crawler:
result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
result = await crawler.arun(
url="https://www.example.com", cache_mode=CacheMode.BYPASS
)
print(result.markdown[:500])
print("Time taken: ", time.time() - start)


async def crawl_with_user_simultion():
async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
url = "YOUR-URL-HERE"
result = await crawler.arun(
url=url,
url=url,
cache_mode=CacheMode.BYPASS,
magic = True, # Automatically detects and removes overlays, popups, and other elements that block content
magic=True, # Automatically detects and removes overlays, popups, and other elements that block content
# simulate_user = True,# Causes a series of random mouse movements and clicks to simulate user interaction
# override_navigator = True # Overrides the navigator object to make it look like a real user
)

print(result.markdown)

print(result.markdown)


async def speed_comparison():
# print("\n--- Speed Comparison ---")
@@ -439,18 +467,18 @@ async def speed_comparison():
# print()
# Simulated Firecrawl performance
from firecrawl import FirecrawlApp
app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])

app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
start = time.time()
scrape_status = app.scrape_url(
'https://www.nbcnews.com/business',
params={'formats': ['markdown', 'html']}
"https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]}
)
end = time.time()
print("Firecrawl:")
print(f"Time taken: {end - start:.2f} seconds")
print(f"Content length: {len(scrape_status['markdown'])} characters")
print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}")
print()
print()

async with AsyncWebCrawler() as crawler:
# Crawl4AI simple crawl
@@ -474,7 +502,9 @@ async def speed_comparison():
url="https://www.nbcnews.com/business",
word_count_threshold=0,
markdown_generator=DefaultMarkdownGenerator(
content_filter = PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
content_filter=PruningContentFilter(
threshold=0.48, threshold_type="fixed", min_word_threshold=0
)
# content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
),
cache_mode=CacheMode.BYPASS,
@@ -498,7 +528,9 @@ async def speed_comparison():
word_count_threshold=0,
cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator(
content_filter = PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
content_filter=PruningContentFilter(
threshold=0.48, threshold_type="fixed", min_word_threshold=0
)
# content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
),
verbose=False,
@@ -520,11 +552,12 @@ async def speed_comparison():
print("If you run these tests in an environment with better network conditions,")
print("you may observe an even more significant speed advantage for Crawl4AI.")


async def generate_knowledge_graph():
class Entity(BaseModel):
name: str
description: str


class Relationship(BaseModel):
entity1: Entity
entity2: Entity
@@ -536,11 +569,11 @@ async def generate_knowledge_graph():
relationships: List[Relationship]

extraction_strategy = LLMExtractionStrategy(
provider='openai/gpt-4o-mini', # Or any other provider, including Ollama and open source models
api_token=os.getenv('OPENAI_API_KEY'), # In case of Ollama just pass "no-token"
schema=KnowledgeGraph.model_json_schema(),
extraction_type="schema",
instruction="""Extract entities and relationships from the given text."""
provider="openai/gpt-4o-mini", # Or any other provider, including Ollama and open source models
api_token=os.getenv("OPENAI_API_KEY"), # In case of Ollama just pass "no-token"
schema=KnowledgeGraph.model_json_schema(),
extraction_type="schema",
instruction="""Extract entities and relationships from the given text.""",
)
async with AsyncWebCrawler() as crawler:
url = "https://paulgraham.com/love.html"
@@ -554,27 +587,22 @@ async def generate_knowledge_graph():
with open(os.path.join(__location__, "kb.json"), "w") as f:
f.write(result.extracted_content)


async def fit_markdown_remove_overlay():

async with AsyncWebCrawler(
headless=True, # Set to False to see what is happening
verbose=True,
user_agent_mode="random",
user_agent_generator_config={
"device_type": "mobile",
"os_type": "android"
},
headless=True, # Set to False to see what is happening
verbose=True,
user_agent_mode="random",
user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
) as crawler:
result = await crawler.arun(
url='https://www.kidocode.com/degrees/technology',
url="https://www.kidocode.com/degrees/technology",
cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter(
threshold=0.48, threshold_type="fixed", min_word_threshold=0
),
options={
"ignore_links": True
}
options={"ignore_links": True},
),
# markdown_generator=DefaultMarkdownGenerator(
#     content_filter=BM25ContentFilter(user_query="", bm25_threshold=1.0),
@@ -583,31 +611,38 @@ async def fit_markdown_remove_overlay():
# }
# ),
)


if result.success:
print(len(result.markdown_v2.raw_markdown))
print(len(result.markdown_v2.markdown_with_citations))
print(len(result.markdown_v2.fit_markdown))


# Save clean html
with open(os.path.join(__location__, "output/cleaned_html.html"), "w") as f:
f.write(result.cleaned_html)

with open(os.path.join(__location__, "output/output_raw_markdown.md"), "w") as f:

with open(
os.path.join(__location__, "output/output_raw_markdown.md"), "w"
) as f:
f.write(result.markdown_v2.raw_markdown)

with open(os.path.join(__location__, "output/output_markdown_with_citations.md"), "w") as f:
f.write(result.markdown_v2.markdown_with_citations)

with open(os.path.join(__location__, "output/output_fit_markdown.md"), "w") as f:

with open(
os.path.join(__location__, "output/output_markdown_with_citations.md"),
"w",
) as f:
f.write(result.markdown_v2.markdown_with_citations)

with open(
os.path.join(__location__, "output/output_fit_markdown.md"), "w"
) as f:
f.write(result.markdown_v2.fit_markdown)


print("Done")


async def main():
# await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))


# await simple_crawl()
# await simple_example_with_running_js_code()
# await simple_example_with_css_selector()
@@ -618,7 +653,7 @@ async def main():
# LLM extraction examples
# await extract_structured_data_using_llm()
# await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
# await extract_structured_data_using_llm("ollama/llama3.2")
# await extract_structured_data_using_llm("ollama/llama3.2")

# You always can pass custom headers to the extraction strategy
# custom_headers = {
@@ -626,13 +661,13 @@ async def main():
#     "X-Custom-Header": "Some-Value"
# }
# await extract_structured_data_using_llm(extra_headers=custom_headers)


# await crawl_dynamic_content_pages_method_1()
# await crawl_dynamic_content_pages_method_2()
await crawl_dynamic_content_pages_method_3()


# await crawl_custom_browser_type()


# await speed_comparison()



@@ -10,15 +10,17 @@ from functools import lru_cache

console = Console()


@lru_cache()
def create_crawler():
crawler = WebCrawler(verbose=True)
crawler.warmup()
return crawler


def print_result(result):
# Print each key in one line and just the first 10 characters of each one's value and three dots
console.print(f"\t[bold]Result:[/bold]")
console.print("\t[bold]Result:[/bold]")
for key, value in result.model_dump().items():
if isinstance(value, str) and value:
console.print(f"\t{key}: [green]{value[:20]}...[/green]")
@@ -33,18 +35,27 @@ def cprint(message, press_any_key=False):
console.print("Press any key to continue...", style="")
input()


def basic_usage(crawler):
cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
result = crawler.run(url="https://www.nbcnews.com/business", only_text = True)
cprint(
"🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]"
)
result = crawler.run(url="https://www.nbcnews.com/business", only_text=True)
cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
print_result(result)


def basic_usage_some_params(crawler):
cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
result = crawler.run(url="https://www.nbcnews.com/business", word_count_threshold=1, only_text = True)
cprint(
"🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]"
)
result = crawler.run(
url="https://www.nbcnews.com/business", word_count_threshold=1, only_text=True
)
cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
print_result(result)


def screenshot_usage(crawler):
cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]")
result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
@@ -55,16 +66,23 @@ def screenshot_usage(crawler):
cprint("Screenshot saved to 'screenshot.png'!")
print_result(result)


def understanding_parameters(crawler):
cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]")
cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.")

cprint(
"\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]"
)
cprint(
"By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action."
)

# First crawl (reads from cache)
cprint("1️⃣ First crawl (caches the result):", True)
start_time = time.time()
result = crawler.run(url="https://www.nbcnews.com/business")
end_time = time.time()
cprint(f"[LOG] 📦 [bold yellow]First crawl took {end_time - start_time} seconds and result (from cache):[/bold yellow]")
cprint(
f"[LOG] 📦 [bold yellow]First crawl took {end_time - start_time} seconds and result (from cache):[/bold yellow]"
)
print_result(result)

# Force to crawl again
@@ -72,169 +90,232 @@ def understanding_parameters(crawler):
start_time = time.time()
result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
end_time = time.time()
cprint(f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time} seconds and result (forced to crawl):[/bold yellow]")
cprint(
f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time} seconds and result (forced to crawl):[/bold yellow]"
)
print_result(result)


def add_chunking_strategy(crawler):
# Adding a chunking strategy: RegexChunking
cprint("\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]", True)
cprint("RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!")
cprint(
"\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]",
True,
)
cprint(
"RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!"
)
result = crawler.run(
url="https://www.nbcnews.com/business",
chunking_strategy=RegexChunking(patterns=["\n\n"])
chunking_strategy=RegexChunking(patterns=["\n\n"]),
)
cprint("[LOG] 📦 [bold yellow]RegexChunking result:[/bold yellow]")
print_result(result)

# Adding another chunking strategy: NlpSentenceChunking
cprint("\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]", True)
cprint("NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!")
cprint(
"\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]",
True,
)
cprint(
"NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!"
)
result = crawler.run(
url="https://www.nbcnews.com/business",
chunking_strategy=NlpSentenceChunking()
url="https://www.nbcnews.com/business", chunking_strategy=NlpSentenceChunking()
)
cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
print_result(result)


def add_extraction_strategy(crawler):
# Adding an extraction strategy: CosineStrategy
cprint("\n🧠 [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]", True)
cprint("CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!")
cprint(
"\n🧠 [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]",
True,
)
cprint(
"CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!"
)
result = crawler.run(
url="https://www.nbcnews.com/business",
extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold = 0.3, verbose=True)
extraction_strategy=CosineStrategy(
word_count_threshold=10,
max_dist=0.2,
linkage_method="ward",
top_k=3,
sim_threshold=0.3,
verbose=True,
),
)
cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]")
print_result(result)


# Using semantic_filter with CosineStrategy
cprint("You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!")
cprint(
"You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!"
)
result = crawler.run(
url="https://www.nbcnews.com/business",
extraction_strategy=CosineStrategy(
semantic_filter="inflation rent prices",
)
),
)
cprint(
"[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]"
)
cprint("[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]")
print_result(result)


def add_llm_extraction_strategy(crawler):
# Adding an LLM extraction strategy without instructions
cprint("\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]", True)
cprint("LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!")
cprint(
"\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]",
True,
)
cprint(
"LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!"
)
result = crawler.run(
url="https://www.nbcnews.com/business",
extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))
extraction_strategy=LLMExtractionStrategy(
provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")
),
)
cprint(
"[LOG] 📦 [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]"
)
cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]")
print_result(result)


# Adding an LLM extraction strategy with instructions
cprint("\n📜 [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]", True)
cprint("Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!")
cprint(
"\n📜 [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]",
True,
)
cprint(
"Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!"
)
result = crawler.run(
url="https://www.nbcnews.com/business",
extraction_strategy=LLMExtractionStrategy(
provider="openai/gpt-4o",
api_token=os.getenv('OPENAI_API_KEY'),
instruction="I am interested in only financial news"
)
api_token=os.getenv("OPENAI_API_KEY"),
instruction="I am interested in only financial news",
),
)
cprint(
"[LOG] 📦 [bold yellow]LLMExtractionStrategy (with instructions) result:[/bold yellow]"
)
cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (with instructions) result:[/bold yellow]")
print_result(result)


result = crawler.run(
url="https://www.nbcnews.com/business",
extraction_strategy=LLMExtractionStrategy(
provider="openai/gpt-4o",
api_token=os.getenv('OPENAI_API_KEY'),
instruction="Extract only content related to technology"
)
api_token=os.getenv("OPENAI_API_KEY"),
instruction="Extract only content related to technology",
),
)
cprint(
"[LOG] 📦 [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]"
)
cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]")
print_result(result)


def targeted_extraction(crawler):
# Using a CSS selector to extract only H2 tags
cprint("\n🎯 [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]", True)
result = crawler.run(
url="https://www.nbcnews.com/business",
css_selector="h2"
cprint(
"\n🎯 [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]",
True,
)
result = crawler.run(url="https://www.nbcnews.com/business", css_selector="h2")
cprint("[LOG] 📦 [bold yellow]CSS Selector (H2 tags) result:[/bold yellow]")
print_result(result)


def interactive_extraction(crawler):
# Passing JavaScript code to interact with the page
cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True)
cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
cprint(
"\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]",
True,
)
cprint(
"In this example we try to click the 'Load More' button on the page using JavaScript code."
)
js_code = """
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
loadMoreButton && loadMoreButton.click();
"""
# crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
# crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
result = crawler.run(
url="https://www.nbcnews.com/business",
js = js_code
result = crawler.run(url="https://www.nbcnews.com/business", js=js_code)
cprint(
"[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]"
)
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
print_result(result)


def multiple_scrip(crawler):
# Passing JavaScript code to interact with the page
cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True)
cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
js_code = ["""
cprint(
"\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]",
True,
)
cprint(
"In this example we try to click the 'Load More' button on the page using JavaScript code."
)
js_code = [
"""
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
loadMoreButton && loadMoreButton.click();
"""] * 2
"""
] * 2
# crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
# crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
result = crawler.run(
url="https://www.nbcnews.com/business",
js = js_code
result = crawler.run(url="https://www.nbcnews.com/business", js=js_code)
cprint(
"[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]"
)
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
print_result(result)


def using_crawler_hooks(crawler):
# Example usage of the hooks for authentication and setting a cookie
def on_driver_created(driver):
print("[HOOK] on_driver_created")
# Example customization: maximize the window
driver.maximize_window()


# Example customization: logging in to a hypothetical website
driver.get('https://example.com/login')

driver.get("https://example.com/login")

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC


WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.NAME, 'username'))
EC.presence_of_element_located((By.NAME, "username"))
)
driver.find_element(By.NAME, 'username').send_keys('testuser')
driver.find_element(By.NAME, 'password').send_keys('password123')
driver.find_element(By.NAME, 'login').click()
driver.find_element(By.NAME, "username").send_keys("testuser")
driver.find_element(By.NAME, "password").send_keys("password123")
driver.find_element(By.NAME, "login").click()
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.ID, 'welcome'))
EC.presence_of_element_located((By.ID, "welcome"))
)
# Add a custom cookie
driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
return driver

driver.add_cookie({"name": "test_cookie", "value": "cookie_value"})
return driver

def before_get_url(driver):
print("[HOOK] before_get_url")
# Example customization: add a custom header
# Enable Network domain for sending headers
driver.execute_cdp_cmd('Network.enable', {})
driver.execute_cdp_cmd("Network.enable", {})
# Add a custom header
driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
driver.execute_cdp_cmd(
"Network.setExtraHTTPHeaders", {"headers": {"X-Test-Header": "test"}}
)
return driver


def after_get_url(driver):
print("[HOOK] after_get_url")
# Example customization: log the URL
@@ -246,48 +327,59 @@ def using_crawler_hooks(crawler):
# Example customization: log the HTML
print(len(html))
return driver

cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)


cprint(
"\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]",
True,
)

crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
crawler_strategy.set_hook('on_driver_created', on_driver_created)
crawler_strategy.set_hook('before_get_url', before_get_url)
crawler_strategy.set_hook('after_get_url', after_get_url)
crawler_strategy.set_hook('before_return_html', before_return_html)

crawler_strategy.set_hook("on_driver_created", on_driver_created)
crawler_strategy.set_hook("before_get_url", before_get_url)
crawler_strategy.set_hook("after_get_url", after_get_url)
crawler_strategy.set_hook("before_return_html", before_return_html)

crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
crawler.warmup()
crawler.warmup()
result = crawler.run(url="https://example.com")


cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
print_result(result= result)

print_result(result=result)


def using_crawler_hooks_dleay_example(crawler):
def delay(driver):
print("Delaying for 5 seconds...")
time.sleep(5)
print("Resuming...")


def create_crawler():
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
crawler_strategy.set_hook('after_get_url', delay)
crawler_strategy.set_hook("after_get_url", delay)
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
crawler.warmup()
return crawler

cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the url to make sure entire page is fetched.[/bold cyan]")
cprint(
"\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the url to make sure entire page is fetched.[/bold cyan]"
)
crawler = create_crawler()
result = crawler.run(url="https://google.com", bypass_cache=True)

result = crawler.run(url="https://google.com", bypass_cache=True)

cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
print_result(result)





def main():
cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
cprint("If this is the first time you're running Crawl4ai, this might take a few seconds to load required model files.")
cprint(
"🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]"
)
cprint(
"⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]"
)
cprint(
"If this is the first time you're running Crawl4ai, this might take a few seconds to load required model files."
)

crawler = create_crawler()

@@ -295,7 +387,7 @@ def main():
basic_usage(crawler)
# basic_usage_some_params(crawler)
understanding_parameters(crawler)


crawler.always_by_pass_cache = True
screenshot_usage(crawler)
add_chunking_strategy(crawler)
@@ -305,8 +397,10 @@ def main():
interactive_extraction(crawler)
multiple_scrip(crawler)

cprint("\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]")
cprint(
"\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]"
)


if __name__ == "__main__":
main()


@@ -11,7 +11,9 @@ from groq import Groq
# Import threadpools to run the crawl_url function in a separate thread
from concurrent.futures import ThreadPoolExecutor

client = AsyncOpenAI(base_url="https://api.groq.com/openai/v1", api_key=os.getenv("GROQ_API_KEY"))
client = AsyncOpenAI(
base_url="https://api.groq.com/openai/v1", api_key=os.getenv("GROQ_API_KEY")
)

# Instrument the OpenAI client
cl.instrument_openai()
@@ -25,41 +27,39 @@ settings = {
"presence_penalty": 0,
}


def extract_urls(text):
url_pattern = re.compile(r'(https?://\S+)')
url_pattern = re.compile(r"(https?://\S+)")
return url_pattern.findall(text)


def crawl_url(url):
data = {
"urls": [url],
"include_raw_html": True,
"word_count_threshold": 10,
"extraction_strategy": "NoExtractionStrategy",
"chunking_strategy": "RegexChunking"
"chunking_strategy": "RegexChunking",
}
response = requests.post("https://crawl4ai.com/crawl", json=data)
response_data = response.json()
response_data = response_data['results'][0]
return response_data['markdown']
response_data = response_data["results"][0]
return response_data["markdown"]


@cl.on_chat_start
async def on_chat_start():
cl.user_session.set("session", {
"history": [],
"context": {}
})
await cl.Message(
content="Welcome to the chat! How can I assist you today?"
).send()
cl.user_session.set("session", {"history": [], "context": {}})
await cl.Message(content="Welcome to the chat! How can I assist you today?").send()


@cl.on_message
async def on_message(message: cl.Message):
user_session = cl.user_session.get("session")


# Extract URLs from the user's message
urls = extract_urls(message.content)



futures = []
with ThreadPoolExecutor() as executor:
for url in urls:
@@ -69,16 +69,9 @@ async def on_message(message: cl.Message):

for url, result in zip(urls, results):
ref_number = f"REF_{len(user_session['context']) + 1}"
user_session["context"][ref_number] = {
"url": url,
"content": result
}
user_session["context"][ref_number] = {"url": url, "content": result}


user_session["history"].append({
"role": "user",
"content": message.content
})
user_session["history"].append({"role": "user", "content": message.content})

# Create a system message that includes the context
context_messages = [
@@ -95,26 +88,17 @@ async def on_message(message: cl.Message):
"If not, there is no need to add a references section. "
"At the end of your response, provide a reference section listing the URLs and their REF numbers only if sources from the appendices were used.\n\n"
"\n\n".join(context_messages)
)
),
}
else:
system_message = {
"role": "system",
"content": "You are a helpful assistant."
}

system_message = {"role": "system", "content": "You are a helpful assistant."}

msg = cl.Message(content="")
await msg.send()

# Get response from the LLM
stream = await client.chat.completions.create(
messages=[
system_message,
*user_session["history"]
],
stream=True,
**settings
messages=[system_message, *user_session["history"]], stream=True, **settings
)

assistant_response = ""
@@ -124,10 +108,7 @@ async def on_message(message: cl.Message):
await msg.stream_token(token)

# Add assistant message to the history
user_session["history"].append({
"role": "assistant",
"content": assistant_response
})
user_session["history"].append({"role": "assistant", "content": assistant_response})
await msg.update()

# Append the reference section to the assistant's response
@@ -154,10 +135,11 @@ async def on_audio_chunk(chunk: cl.AudioChunk):

pass


@cl.step(type="tool")
async def speech_to_text(audio_file):
cli = Groq()


response = await client.audio.transcriptions.create(
model="whisper-large-v3", file=audio_file
)
@@ -172,24 +154,19 @@ async def on_audio_end(elements: list[ElementBased]):
audio_buffer.seek(0)  # Move the file pointer to the beginning
audio_file = audio_buffer.read()
audio_mime_type: str = cl.user_session.get("audio_mime_type")


start_time = time.time()
whisper_input = (audio_buffer.name, audio_file, audio_mime_type)
transcription = await speech_to_text(whisper_input)
end_time = time.time()
print(f"Transcription took {end_time - start_time} seconds")

user_msg = cl.Message(
author="You",
type="user_message",
content=transcription
)

user_msg = cl.Message(author="You", type="user_message", content=transcription)
await user_msg.send()
await on_message(user_msg)


if __name__ == "__main__":
from chainlit.cli import run_chainlit

run_chainlit(__file__)



@@ -1,4 +1,3 @@

import requests, base64, os

data = {
@@ -6,59 +5,50 @@ data = {
"screenshot": True,
}

response = requests.post("https://crawl4ai.com/crawl", json=data)
result = response.json()['results'][0]
response = requests.post("https://crawl4ai.com/crawl", json=data)
result = response.json()["results"][0]
print(result.keys())
# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media',
# 'links', 'screenshot', 'markdown', 'extracted_content',
# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media',
# 'links', 'screenshot', 'markdown', 'extracted_content',
# 'metadata', 'error_message'])
with open("screenshot.png", "wb") as f:
f.write(base64.b64decode(result['screenshot']))

f.write(base64.b64decode(result["screenshot"]))

# Example of filtering the content using CSS selectors
data = {
"urls": [
"https://www.nbcnews.com/business"
],
"urls": ["https://www.nbcnews.com/business"],
"css_selector": "article",
"screenshot": True,
}

# Example of executing a JS script on the page before extracting the content
data = {
"urls": [
"https://www.nbcnews.com/business"
],
"urls": ["https://www.nbcnews.com/business"],
"screenshot": True,
'js' : ["""
"js": [
"""
const loadMoreButton = Array.from(document.querySelectorAll('button')).
find(button => button.textContent.includes('Load More'));
loadMoreButton && loadMoreButton.click();
"""]
"""
],
}

# Example of using a custom extraction strategy
data = {
"urls": [
"https://www.nbcnews.com/business"
],
"urls": ["https://www.nbcnews.com/business"],
"extraction_strategy": "CosineStrategy",
"extraction_strategy_args": {
"semantic_filter": "inflation rent prices"
},
"extraction_strategy_args": {"semantic_filter": "inflation rent prices"},
}

# Example of using LLM to extract content
data = {
"urls": [
"https://www.nbcnews.com/business"
],
"urls": ["https://www.nbcnews.com/business"],
"extraction_strategy": "LLMExtractionStrategy",
"extraction_strategy_args": {
"provider": "groq/llama3-8b-8192",
"api_token": os.environ.get("GROQ_API_KEY"),
"instruction": """I am interested in only financial news,
and translate them in French."""
and translate them in French.""",
},
}


@@ -5,42 +5,47 @@ import os
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

# Create tmp directory if it doesn't exist
parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
parent_dir = os.path.dirname(
os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)
tmp_dir = os.path.join(parent_dir, "tmp")
os.makedirs(tmp_dir, exist_ok=True)


async def main():
# Configure crawler to fetch SSL certificate
config = CrawlerRunConfig(
fetch_ssl_certificate=True,
cache_mode=CacheMode.BYPASS # Bypass cache to always get fresh certificates
cache_mode=CacheMode.BYPASS, # Bypass cache to always get fresh certificates
)

async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url='https://example.com',
config=config
)

result = await crawler.arun(url="https://example.com", config=config)

if result.success and result.ssl_certificate:
cert = result.ssl_certificate


# 1. Access certificate properties directly
print("\nCertificate Information:")
print(f"Issuer: {cert.issuer.get('CN', '')}")
print(f"Valid until: {cert.valid_until}")
print(f"Fingerprint: {cert.fingerprint}")


# 2. Export certificate in different formats
cert.to_json(os.path.join(tmp_dir, "certificate.json")) # For analysis
print("\nCertificate exported to:")
print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}")

pem_data = cert.to_pem(os.path.join(tmp_dir, "certificate.pem")) # For web servers

pem_data = cert.to_pem(
os.path.join(tmp_dir, "certificate.pem")
) # For web servers
print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}")

der_data = cert.to_der(os.path.join(tmp_dir, "certificate.der")) # For Java apps

der_data = cert.to_der(
os.path.join(tmp_dir, "certificate.der")
) # For Java apps
print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")


if __name__ == "__main__":
asyncio.run(main())

@@ -1,39 +1,41 @@
import os
import time
import json
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import *

url = r'https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot'
url = r"https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot"

crawler = WebCrawler()
crawler.warmup()

from pydantic import BaseModel, Field


class PageSummary(BaseModel):
title: str = Field(..., description="Title of the page.")
summary: str = Field(..., description="Summary of the page.")
brief_summary: str = Field(..., description="Brief summary of the page.")
keywords: list = Field(..., description="Keywords assigned to the page.")


result = crawler.run(
url=url,
word_count_threshold=1,
extraction_strategy= LLMExtractionStrategy(
provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
extraction_strategy=LLMExtractionStrategy(
provider="openai/gpt-4o",
api_token=os.getenv("OPENAI_API_KEY"),
schema=PageSummary.model_json_schema(),
extraction_type="schema",
apply_chunking =False,
instruction="From the crawled content, extract the following details: "\
"1. Title of the page "\
"2. Summary of the page, which is a detailed summary "\
"3. Brief summary of the page, which is a paragraph text "\
"4. Keywords assigned to the page, which is a list of keywords. "\
'The extracted JSON format should look like this: '\
'{ "title": "Page Title", "summary": "Detailed summary of the page.", "brief_summary": "Brief summary in a paragraph.", "keywords": ["keyword1", "keyword2", "keyword3"] }'
apply_chunking=False,
instruction="From the crawled content, extract the following details: "
"1. Title of the page "
"2. Summary of the page, which is a detailed summary "
"3. Brief summary of the page, which is a paragraph text "
"4. Keywords assigned to the page, which is a list of keywords. "
"The extracted JSON format should look like this: "
'{ "title": "Page Title", "summary": "Detailed summary of the page.", "brief_summary": "Brief summary in a paragraph.", "keywords": ["keyword1", "keyword2", "keyword3"] }',
),
bypass_cache=True,
)

@@ -1,4 +1,5 @@
import os, sys

# append the parent directory to the sys.path
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
@@ -13,19 +14,18 @@ import json
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.content_filter_strategy import BM25ContentFilter


# 1. File Download Processing Example
async def download_example():
    """Example of downloading files from Python.org"""
    # downloads_path = os.path.join(os.getcwd(), "downloads")
    downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads")
    os.makedirs(downloads_path, exist_ok=True)

    print(f"Downloads will be saved to: {downloads_path}")

    async with AsyncWebCrawler(
        accept_downloads=True,
        downloads_path=downloads_path,
        verbose=True
        accept_downloads=True, downloads_path=downloads_path, verbose=True
    ) as crawler:
        result = await crawler.arun(
            url="https://www.python.org/downloads/",
@@ -40,9 +40,9 @@ async def download_example():
            }
            """,
            delay_before_return_html=1, # Wait 5 seconds to ensure download starts
            cache_mode=CacheMode.BYPASS
            cache_mode=CacheMode.BYPASS,
        )

        if result.downloaded_files:
            print("\nDownload successful!")
            print("Downloaded files:")
@@ -52,25 +52,26 @@ async def download_example():
        else:
            print("\nNo files were downloaded")


# 2. Local File and Raw HTML Processing Example
async def local_and_raw_html_example():
    """Example of processing local files and raw HTML"""
    # Create a sample HTML file
    sample_file = os.path.join(__data__, "sample.html")
    with open(sample_file, "w") as f:
        f.write("""
        f.write(
            """
        <html><body>
            <h1>Test Content</h1>
            <p>This is a test paragraph.</p>
        </body></html>
        """)

        """
        )

    async with AsyncWebCrawler(verbose=True) as crawler:
        # Process local file
        local_result = await crawler.arun(
            url=f"file://{os.path.abspath(sample_file)}"
        )

        local_result = await crawler.arun(url=f"file://{os.path.abspath(sample_file)}")

        # Process raw HTML
        raw_html = """
        <html><body>
@@ -78,16 +79,15 @@ async def local_and_raw_html_example():
        <p>This is a test of raw HTML processing.</p>
        </body></html>
        """
        raw_result = await crawler.arun(
            url=f"raw:{raw_html}"
        )

        raw_result = await crawler.arun(url=f"raw:{raw_html}")

        # Clean up
        os.remove(sample_file)

        print("Local file content:", local_result.markdown)
        print("\nRaw HTML content:", raw_result.markdown)


# 3. Enhanced Markdown Generation Example
async def markdown_generation_example():
    """Example of enhanced markdown generation with citations and LLM-friendly features"""
@@ -97,58 +97,66 @@ async def markdown_generation_example():
        # user_query="History and cultivation",
        bm25_threshold=1.0
    )

    result = await crawler.arun(
        url="https://en.wikipedia.org/wiki/Apple",
        css_selector="main div#bodyContent",
        content_filter=content_filter,
        cache_mode=CacheMode.BYPASS
        cache_mode=CacheMode.BYPASS,
    )

    from crawl4ai import AsyncWebCrawler

    from crawl4ai.content_filter_strategy import BM25ContentFilter

    result = await crawler.arun(
        url="https://en.wikipedia.org/wiki/Apple",
        css_selector="main div#bodyContent",
        content_filter=BM25ContentFilter()
        content_filter=BM25ContentFilter(),
    )
    print(result.markdown_v2.fit_markdown)

    print("\nMarkdown Generation Results:")
    print(f"1. Original markdown length: {len(result.markdown)}")
    print(f"2. New markdown versions (markdown_v2):")
    print("2. New markdown versions (markdown_v2):")
    print(f" - Raw markdown length: {len(result.markdown_v2.raw_markdown)}")
    print(f" - Citations markdown length: {len(result.markdown_v2.markdown_with_citations)}")
    print(f" - References section length: {len(result.markdown_v2.references_markdown)}")
    print(
        f" - Citations markdown length: {len(result.markdown_v2.markdown_with_citations)}"
    )
    print(
        f" - References section length: {len(result.markdown_v2.references_markdown)}"
    )
    if result.markdown_v2.fit_markdown:
        print(f" - Filtered markdown length: {len(result.markdown_v2.fit_markdown)}")

        print(
            f" - Filtered markdown length: {len(result.markdown_v2.fit_markdown)}"
        )

    # Save examples to files
    output_dir = os.path.join(__data__, "markdown_examples")
    os.makedirs(output_dir, exist_ok=True)

    # Save different versions
    with open(os.path.join(output_dir, "1_raw_markdown.md"), "w") as f:
        f.write(result.markdown_v2.raw_markdown)

    with open(os.path.join(output_dir, "2_citations_markdown.md"), "w") as f:
        f.write(result.markdown_v2.markdown_with_citations)

    with open(os.path.join(output_dir, "3_references.md"), "w") as f:
        f.write(result.markdown_v2.references_markdown)

    if result.markdown_v2.fit_markdown:
        with open(os.path.join(output_dir, "4_filtered_markdown.md"), "w") as f:
            f.write(result.markdown_v2.fit_markdown)

    print(f"\nMarkdown examples saved to: {output_dir}")

    # Show a sample of citations and references
    print("\nSample of markdown with citations:")
    print(result.markdown_v2.markdown_with_citations[:500] + "...\n")
    print("Sample of references:")
    print('\n'.join(result.markdown_v2.references_markdown.split('\n')[:10]) + "...")
    print(
        "\n".join(result.markdown_v2.references_markdown.split("\n")[:10]) + "..."
    )


# 4. Browser Management Example
async def browser_management_example():
@@ -156,38 +164,38 @@ async def browser_management_example():
    # Use the specified user directory path
    user_data_dir = os.path.join(Path.home(), ".crawl4ai", "browser_profile")
    os.makedirs(user_data_dir, exist_ok=True)

    print(f"Browser profile will be saved to: {user_data_dir}")

    async with AsyncWebCrawler(
        use_managed_browser=True,
        user_data_dir=user_data_dir,
        headless=False,
        verbose=True
        verbose=True,
    ) as crawler:

        result = await crawler.arun(
            url="https://crawl4ai.com",
            # session_id="persistent_session_1",
            cache_mode=CacheMode.BYPASS
        )
            cache_mode=CacheMode.BYPASS,
        )
        # Use GitHub as an example - it's a good test for browser management
        # because it requires proper browser handling
        result = await crawler.arun(
            url="https://github.com/trending",
            # session_id="persistent_session_1",
            cache_mode=CacheMode.BYPASS
            cache_mode=CacheMode.BYPASS,
        )

        print("\nBrowser session result:", result.success)
        if result.success:
            print("Page title:", result.metadata.get('title', 'No title found'))
            print("Page title:", result.metadata.get("title", "No title found"))


# 5. API Usage Example
async def api_example():
    """Example of using the new API endpoints"""
    api_token = os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code"
    headers = {'Authorization': f'Bearer {api_token}'}
    api_token = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code"
    headers = {"Authorization": f"Bearer {api_token}"}
    async with aiohttp.ClientSession() as session:
        # Submit crawl job
        crawl_request = {
@@ -199,25 +207,17 @@ async def api_example():
                    "name": "Hacker News Articles",
                    "baseSelector": ".athing",
                    "fields": [
                        {
                            "name": "title",
                            "selector": ".title a",
                            "type": "text"
                        },
                        {
                            "name": "score",
                            "selector": ".score",
                            "type": "text"
                        },
                        {"name": "title", "selector": ".title a", "type": "text"},
                        {"name": "score", "selector": ".score", "type": "text"},
                        {
                            "name": "url",
                            "selector": ".title a",
                            "type": "attribute",
                            "attribute": "href"
                        }
                    ]
                            "attribute": "href",
                        },
                    ],
                }
            }
                },
            },
            "crawler_params": {
                "headless": True,
@@ -227,51 +227,50 @@ async def api_example():
                # "screenshot": True,
                # "magic": True
            }

        async with session.post(
            "http://localhost:11235/crawl",
            json=crawl_request,
            headers=headers
            "http://localhost:11235/crawl", json=crawl_request, headers=headers
        ) as response:
            task_data = await response.json()
            task_id = task_data["task_id"]

        # Check task status
        while True:
            async with session.get(
                f"http://localhost:11235/task/{task_id}",
                headers=headers
                f"http://localhost:11235/task/{task_id}", headers=headers
            ) as status_response:
                result = await status_response.json()
                print(f"Task status: {result['status']}")

                if result["status"] == "completed":
                    print("Task completed!")
                    print("Results:")
                    news = json.loads(result["results"][0]['extracted_content'])
                    news = json.loads(result["results"][0]["extracted_content"])
                    print(json.dumps(news[:4], indent=2))
                    break
                else:
                    await asyncio.sleep(1)


# Main execution
async def main():
    # print("Running Crawl4AI feature examples...")

    # print("\n1. Running Download Example:")
    # await download_example()

    # print("\n2. Running Markdown Generation Example:")
    # await markdown_generation_example()

    # # print("\n3. Running Local and Raw HTML Example:")
    # await local_and_raw_html_example()

    # # print("\n4. Running Browser Management Example:")
    await browser_management_example()

    # print("\n5. Running API Example:")
    await api_example()


if __name__ == "__main__":
    asyncio.run(main())
    asyncio.run(main())

@@ -10,18 +10,17 @@ import asyncio
import os
import json
import re
from typing import List, Optional, Dict, Any
from pydantic import BaseModel, Field
from typing import List
from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    CacheMode,
    LLMExtractionStrategy,
    JsonCssExtractionStrategy
    JsonCssExtractionStrategy,
)
from crawl4ai.content_filter_strategy import RelevantContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from bs4 import BeautifulSoup

# Sample HTML for demonstrations
@@ -52,17 +51,18 @@ SAMPLE_HTML = """
</div>
"""


async def demo_ssl_features():
    """
    Enhanced SSL & Security Features Demo
    -----------------------------------

    This example demonstrates the new SSL certificate handling and security features:
    1. Custom certificate paths
    2. SSL verification options
    3. HTTPS error handling
    4. Certificate validation configurations

    These features are particularly useful when:
    - Working with self-signed certificates
    - Dealing with corporate proxies
@@ -76,14 +76,11 @@ async def demo_ssl_features():

    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        fetch_ssl_certificate=True # Enable SSL certificate fetching
        fetch_ssl_certificate=True, # Enable SSL certificate fetching
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://example.com",
            config=run_config
        )
        result = await crawler.arun(url="https://example.com", config=run_config)
        print(f"SSL Crawl Success: {result.success}")
        result.ssl_certificate.to_json(
            os.path.join(os.getcwd(), "ssl_certificate.json")
@@ -91,11 +88,12 @@ async def demo_ssl_features():
        if not result.success:
            print(f"SSL Error: {result.error_message}")


async def demo_content_filtering():
    """
    Smart Content Filtering Demo
    ----------------------

    Demonstrates advanced content filtering capabilities:
    1. Custom filter to identify and extract specific content
    2. Integration with markdown generation
@@ -110,87 +108,90 @@ async def demo_content_filtering():
            super().__init__()
            # Add news-specific patterns
            self.negative_patterns = re.compile(
                r'nav|footer|header|sidebar|ads|comment|share|related|recommended|popular|trending',
                re.I
                r"nav|footer|header|sidebar|ads|comment|share|related|recommended|popular|trending",
                re.I,
            )
            self.min_word_count = 30 # Higher threshold for news content

        def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
        def filter_content(
            self, html: str, min_word_threshold: int = None
        ) -> List[str]:
            """
            Implements news-specific content filtering logic.

            Args:
                html (str): HTML content to be filtered
                min_word_threshold (int, optional): Minimum word count threshold

            Returns:
                List[str]: List of filtered HTML content blocks
            """
            if not html or not isinstance(html, str):
                return []

            soup = BeautifulSoup(html, 'lxml')

            soup = BeautifulSoup(html, "lxml")
            if not soup.body:
                soup = BeautifulSoup(f'<body>{html}</body>', 'lxml')

            body = soup.find('body')

                soup = BeautifulSoup(f"<body>{html}</body>", "lxml")

            body = soup.find("body")

            # Extract chunks with metadata
            chunks = self.extract_text_chunks(body, min_word_threshold or self.min_word_count)

            chunks = self.extract_text_chunks(
                body, min_word_threshold or self.min_word_count
            )

            # Filter chunks based on news-specific criteria
            filtered_chunks = []
            for _, text, tag_type, element in chunks:
                # Skip if element has negative class/id
                if self.is_excluded(element):
                    continue

                # Headers are important in news articles
                if tag_type == 'header':
                if tag_type == "header":
                    filtered_chunks.append(self.clean_element(element))
                    continue

                # For content, check word count and link density
                text = element.get_text(strip=True)
                if len(text.split()) >= (min_word_threshold or self.min_word_count):
                    # Calculate link density
                    links_text = ' '.join(a.get_text(strip=True) for a in element.find_all('a'))
                    links_text = " ".join(
                        a.get_text(strip=True) for a in element.find_all("a")
                    )
                    link_density = len(links_text) / len(text) if text else 1

                    # Accept if link density is reasonable
                    if link_density < 0.5:
                        filtered_chunks.append(self.clean_element(element))

            return filtered_chunks

    # Create markdown generator with custom filter
    markdown_gen = DefaultMarkdownGenerator(
        content_filter=CustomNewsFilter()
    )
    markdown_gen = DefaultMarkdownGenerator(content_filter=CustomNewsFilter())

    run_config = CrawlerRunConfig(
        markdown_generator=markdown_gen,
        cache_mode=CacheMode.BYPASS
        markdown_generator=markdown_gen, cache_mode=CacheMode.BYPASS
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://news.ycombinator.com",
            config=run_config
            url="https://news.ycombinator.com", config=run_config
        )
        print("Filtered Content Sample:")
        print(result.markdown[:500]) # Show first 500 chars


async def demo_json_extraction():
    """
    Improved JSON Extraction Demo
    ---------------------------

    Demonstrates the enhanced JSON extraction capabilities:
    1. Base element attributes extraction
    2. Complex nested structures
    3. Multiple extraction patterns

    Key features shown:
    - Extracting attributes from base elements (href, data-* attributes)
    - Processing repeated patterns
@@ -206,7 +207,7 @@ async def demo_json_extraction():
            "baseSelector": "div.article-list",
            "baseFields": [
                {"name": "list_id", "type": "attribute", "attribute": "data-list-id"},
                {"name": "category", "type": "attribute", "attribute": "data-category"}
                {"name": "category", "type": "attribute", "attribute": "data-category"},
            ],
            "fields": [
                {
@@ -214,8 +215,16 @@ async def demo_json_extraction():
                    "selector": "article.post",
                    "type": "nested_list",
                    "baseFields": [
                        {"name": "post_id", "type": "attribute", "attribute": "data-post-id"},
                        {"name": "author_id", "type": "attribute", "attribute": "data-author"}
                        {
                            "name": "post_id",
                            "type": "attribute",
                            "attribute": "data-post-id",
                        },
                        {
                            "name": "author_id",
                            "type": "attribute",
                            "attribute": "data-author",
                        },
                    ],
                    "fields": [
                        {
@@ -223,60 +232,68 @@ async def demo_json_extraction():
                            "selector": "h2.title a",
                            "type": "text",
                            "baseFields": [
                                {"name": "url", "type": "attribute", "attribute": "href"}
                            ]
                                {
                                    "name": "url",
                                    "type": "attribute",
                                    "attribute": "href",
                                }
                            ],
                        },
                        {
                            "name": "author",
                            "selector": "div.meta a.author",
                            "type": "text",
                            "baseFields": [
                                {"name": "profile_url", "type": "attribute", "attribute": "href"}
                            ]
                        },
                        {
                            "name": "date",
                            "selector": "span.date",
                            "type": "text"
                                {
                                    "name": "profile_url",
                                    "type": "attribute",
                                    "attribute": "href",
                                }
                            ],
                        },
                        {"name": "date", "selector": "span.date", "type": "text"},
                        {
                            "name": "read_more",
                            "selector": "a.read-more",
                            "type": "nested",
                            "fields": [
                                {"name": "text", "type": "text"},
                                {"name": "url", "type": "attribute", "attribute": "href"}
                            ]
                        }
                    ]
                                {
                                    "name": "url",
                                    "type": "attribute",
                                    "attribute": "href",
                                },
                            ],
                        },
                    ],
                }
            ]
            ],
        }
    )

    # Demonstrate extraction from raw HTML
    run_config = CrawlerRunConfig(
        extraction_strategy=json_strategy,
        cache_mode=CacheMode.BYPASS
        extraction_strategy=json_strategy, cache_mode=CacheMode.BYPASS
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="raw:" + SAMPLE_HTML, # Use raw: prefix for raw HTML
            config=run_config
            config=run_config,
        )
        print("Extracted Content:")
        print(result.extracted_content)


async def demo_input_formats():
    """
    Input Format Handling Demo
    ----------------------

    Demonstrates how LLM extraction can work with different input formats:
    1. Markdown (default) - Good for simple text extraction
    2. HTML - Better when you need structure and attributes

    This example shows how HTML input can be beneficial when:
    - You need to understand the DOM structure
    - You want to extract both visible text and HTML attributes
@@ -350,7 +367,7 @@ async def demo_input_formats():
        </footer>
    </div>
    """

    # Use raw:// prefix to pass HTML content directly
    url = f"raw://{dummy_html}"

@@ -359,18 +376,30 @@ async def demo_input_formats():

    # Define our schema using Pydantic
    class JobRequirement(BaseModel):
        category: str = Field(description="Category of the requirement (e.g., Technical, Soft Skills)")
        items: List[str] = Field(description="List of specific requirements in this category")
        priority: str = Field(description="Priority level (Required/Preferred) based on the HTML class or context")
        category: str = Field(
            description="Category of the requirement (e.g., Technical, Soft Skills)"
        )
        items: List[str] = Field(
            description="List of specific requirements in this category"
        )
        priority: str = Field(
            description="Priority level (Required/Preferred) based on the HTML class or context"
        )

    class JobPosting(BaseModel):
        title: str = Field(description="Job title")
        department: str = Field(description="Department or team")
        location: str = Field(description="Job location, including remote options")
        salary_range: Optional[str] = Field(description="Salary range if specified")
        requirements: List[JobRequirement] = Field(description="Categorized job requirements")
        application_deadline: Optional[str] = Field(description="Application deadline if specified")
        contact_info: Optional[dict] = Field(description="Contact information from footer or contact section")
        requirements: List[JobRequirement] = Field(
            description="Categorized job requirements"
        )
        application_deadline: Optional[str] = Field(
            description="Application deadline if specified"
        )
        contact_info: Optional[dict] = Field(
            description="Contact information from footer or contact section"
        )

    # First try with markdown (default)
    markdown_strategy = LLMExtractionStrategy(
@@ -382,7 +411,7 @@ async def demo_input_formats():
        Extract job posting details into structured data. Focus on the visible text content
        and organize requirements into categories.
        """,
        input_format="markdown" # default
        input_format="markdown", # default
    )

    # Then with HTML for better structure understanding
@@ -400,34 +429,25 @@ async def demo_input_formats():

        Use HTML attributes and classes to enhance extraction accuracy.
        """,
        input_format="html" # explicitly use HTML
        input_format="html", # explicitly use HTML
    )

    async with AsyncWebCrawler() as crawler:
        # Try with markdown first
        markdown_config = CrawlerRunConfig(
            extraction_strategy=markdown_strategy
        )
        markdown_result = await crawler.arun(
            url=url,
            config=markdown_config
        )
        markdown_config = CrawlerRunConfig(extraction_strategy=markdown_strategy)
        markdown_result = await crawler.arun(url=url, config=markdown_config)
        print("\nMarkdown-based Extraction Result:")
        items = json.loads(markdown_result.extracted_content)
        print(json.dumps(items, indent=2))

        # Then with HTML for better structure understanding
        html_config = CrawlerRunConfig(
            extraction_strategy=html_strategy
        )
        html_result = await crawler.arun(
            url=url,
            config=html_config
        )
        html_config = CrawlerRunConfig(extraction_strategy=html_strategy)
        html_result = await crawler.arun(url=url, config=html_config)
        print("\nHTML-based Extraction Result:")
        items = json.loads(html_result.extracted_content)
        print(json.dumps(items, indent=2))


# Main execution
async def main():
    print("Crawl4AI v0.4.24 Feature Walkthrough")
@@ -439,5 +459,6 @@ async def main():
    await demo_json_extraction()
    # await demo_input_formats()


if __name__ == "__main__":
    asyncio.run(main())
