Update Quickstart examples

Author: unclecode
Date: 2024-10-13 14:37:45 +08:00
parent 68e9144ce3
commit b9bbd42373


@@ -10,6 +10,7 @@ import time
 import json
 import os
 import re
+from typing import Dict
 from bs4 import BeautifulSoup
 from pydantic import BaseModel, Field
 from crawl4ai import AsyncWebCrawler
@@ -18,6 +19,8 @@ from crawl4ai.extraction_strategy import (
     LLMExtractionStrategy,
 )
 
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
 print("Crawl4AI: Advanced Web Crawling and Data Extraction")
 print("GitHub Repository: https://github.com/unclecode/crawl4ai")
 print("Twitter: @unclecode")
@@ -30,7 +33,7 @@ async def simple_crawl():
         result = await crawler.arun(url="https://www.nbcnews.com/business")
         print(result.markdown[:500])  # Print first 500 characters
 
-async def js_and_css():
+async def simple_example_with_running_js_code():
     print("\n--- Executing JavaScript and Using CSS Selectors ---")
     # New code to handle the wait_for parameter
     wait_for = """() => {
@@ -47,12 +50,21 @@ async def js_and_css():
         result = await crawler.arun(
             url="https://www.nbcnews.com/business",
             js_code=js_code,
-            # css_selector="article.tease-card",
             # wait_for=wait_for,
             bypass_cache=True,
         )
         print(result.markdown[:500])  # Print first 500 characters
 
+async def simple_example_with_css_selector():
+    print("\n--- Using CSS Selectors ---")
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business",
+            css_selector=".wide-tease-item__description",
+            bypass_cache=True,
+        )
+        print(result.markdown[:500])  # Print first 500 characters
 
 async def use_proxy():
     print("\n--- Using a Proxy ---")
     print(
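As used in this file, css_selector narrows the returned markdown to elements matching the selector, while js_code and wait_for drive page interaction before capture. A minimal sketch of invoking the newly added example on its own, assuming only the names defined in this file plus the standard library:

import asyncio

# Run just the new CSS-selector example; asyncio.run supplies the event loop
# that the async function needs.
asyncio.run(simple_example_with_css_selector())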
@@ -66,7 +78,6 @@ async def use_proxy():
     #     )
     #     print(result.markdown[:500])  # Print first 500 characters
 
-
 async def capture_and_save_screenshot(url: str, output_path: str):
     async with AsyncWebCrawler(verbose=True) as crawler:
         result = await crawler.arun(
@@ -413,8 +424,10 @@ async def speed_comparison():
 
 async def main():
     await simple_crawl()
-    await js_and_css()
+    await simple_example_with_running_js_code()
+    await simple_example_with_css_selector()
     await use_proxy()
+    await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
     await extract_structured_data_using_css_extractor()
 
     # LLM extraction examples
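The new screenshot call in main() writes to a tmp/ directory under __location__, which likely needs to exist before the file can be saved. A minimal, hypothetical entry-point sketch (os.makedirs and asyncio.run are standard library; main and __location__ come from this diff):

import asyncio
import os

if __name__ == "__main__":
    # Hypothetical guard: create the screenshot output directory if it is
    # missing, then drive the async main() shown above.
    os.makedirs(os.path.join(__location__, "tmp"), exist_ok=True)
    asyncio.run(main())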