Update all documentation to import extraction strategies directly from crawl4ai.
This commit is contained in:
@@ -352,7 +352,7 @@ if __name__ == "__main__":
|
|||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
import json
|
import json
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
@@ -426,7 +426,7 @@ if __name__ == "__main__":
|
|||||||
import os
|
import os
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
from crawl4ai import LLMExtractionStrategy
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
class OpenAIModelFee(BaseModel):
|
class OpenAIModelFee(BaseModel):
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||||
from crawl4ai.hub import BaseCrawler
|
from crawl4ai.hub import BaseCrawler
|
||||||
from crawl4ai.utils import optimize_html, get_home_folder, preprocess_html_for_schema
|
from crawl4ai.utils import optimize_html, get_home_folder, preprocess_html_for_schema
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
|||||||
@@ -7901,7 +7901,7 @@ from pydantic import BaseModel, Field
|
|||||||
from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
|
||||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||||
from crawl4ai.extraction_strategy import (
|
from crawl4ai import (
|
||||||
JsonCssExtractionStrategy,
|
JsonCssExtractionStrategy,
|
||||||
LLMExtractionStrategy,
|
LLMExtractionStrategy,
|
||||||
)
|
)
|
||||||
@@ -8301,7 +8301,7 @@ async def crawl_dynamic_content_pages_method_2():
|
|||||||
|
|
||||||
|
|
||||||
async def cosine_similarity_extraction():
|
async def cosine_similarity_extraction():
|
||||||
from crawl4ai.extraction_strategy import CosineStrategy
|
from crawl4ai import CosineStrategy
|
||||||
crawl_config = CrawlerRunConfig(
|
crawl_config = CrawlerRunConfig(
|
||||||
cache_mode=CacheMode.BYPASS,
|
cache_mode=CacheMode.BYPASS,
|
||||||
extraction_strategy=CosineStrategy(
|
extraction_strategy=CosineStrategy(
|
||||||
|
|||||||
@@ -354,7 +354,7 @@ In a typical scenario, you define **one** `BrowserConfig` for your crawler sessi
|
|||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
# 1) Browser config: headless, bigger viewport, no proxy
|
# 1) Browser config: headless, bigger viewport, no proxy
|
||||||
@@ -1042,7 +1042,7 @@ You can combine content selection with a more advanced extraction strategy. For
|
|||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
# Minimal schema for repeated items
|
# Minimal schema for repeated items
|
||||||
@@ -1094,7 +1094,7 @@ import asyncio
|
|||||||
import json
|
import json
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
from crawl4ai import LLMExtractionStrategy
|
||||||
|
|
||||||
class ArticleData(BaseModel):
|
class ArticleData(BaseModel):
|
||||||
headline: str
|
headline: str
|
||||||
@@ -1139,7 +1139,7 @@ Below is a short function that unifies **CSS selection**, **exclusion** logic, a
|
|||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
async def extract_main_articles(url: str):
|
async def extract_main_articles(url: str):
|
||||||
schema = {
|
schema = {
|
||||||
@@ -1488,7 +1488,7 @@ If you run a JSON-based extraction strategy (CSS, XPath, LLM, etc.), the structu
|
|||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
schema = {
|
schema = {
|
||||||
@@ -4722,7 +4722,7 @@ if __name__ == "__main__":
|
|||||||
Once dynamic content is loaded, you can attach an **`extraction_strategy`** (like `JsonCssExtractionStrategy` or `LLMExtractionStrategy`). For example:
|
Once dynamic content is loaded, you can attach an **`extraction_strategy`** (like `JsonCssExtractionStrategy` or `LLMExtractionStrategy`). For example:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
schema = {
|
schema = {
|
||||||
"name": "Commits",
|
"name": "Commits",
|
||||||
@@ -4902,7 +4902,7 @@ Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. B
|
|||||||
> **New!** Crawl4AI now provides a powerful utility to automatically generate extraction schemas using LLM. This is a one-time cost that gives you a reusable schema for fast, LLM-free extractions:
|
> **New!** Crawl4AI now provides a powerful utility to automatically generate extraction schemas using LLM. This is a one-time cost that gives you a reusable schema for fast, LLM-free extractions:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
from crawl4ai import LLMConfig
|
from crawl4ai import LLMConfig
|
||||||
|
|
||||||
# Generate a schema (one-time cost)
|
# Generate a schema (one-time cost)
|
||||||
@@ -4932,7 +4932,7 @@ Here's a basic extraction example:
|
|||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
schema = {
|
schema = {
|
||||||
@@ -4987,7 +4987,7 @@ import json
|
|||||||
import asyncio
|
import asyncio
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
from crawl4ai import LLMExtractionStrategy
|
||||||
|
|
||||||
class OpenAIModelFee(BaseModel):
|
class OpenAIModelFee(BaseModel):
|
||||||
model_name: str = Field(..., description="Name of the OpenAI model.")
|
model_name: str = Field(..., description="Name of the OpenAI model.")
|
||||||
@@ -5103,7 +5103,7 @@ Some sites require multiple “page clicks” or dynamic JavaScript updates. Bel
|
|||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
async def extract_structured_data_using_css_extractor():
|
async def extract_structured_data_using_css_extractor():
|
||||||
print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
|
print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
|
||||||
@@ -7300,7 +7300,7 @@ Here's an example of crawling GitHub commits across multiple pages while preserv
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.async_configs import CrawlerRunConfig
|
from crawl4ai.async_configs import CrawlerRunConfig
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
from crawl4ai.cache_context import CacheMode
|
from crawl4ai.cache_context import CacheMode
|
||||||
|
|
||||||
async def crawl_dynamic_content():
|
async def crawl_dynamic_content():
|
||||||
@@ -7850,7 +7850,7 @@ The Cosine Strategy:
|
|||||||
## Basic Usage
|
## Basic Usage
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.extraction_strategy import CosineStrategy
|
from crawl4ai import CosineStrategy
|
||||||
|
|
||||||
strategy = CosineStrategy(
|
strategy = CosineStrategy(
|
||||||
semantic_filter="product reviews", # Target content type
|
semantic_filter="product reviews", # Target content type
|
||||||
@@ -8161,7 +8161,7 @@ import json
|
|||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from typing import List
|
from typing import List
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
from crawl4ai import LLMExtractionStrategy
|
||||||
|
|
||||||
class Product(BaseModel):
|
class Product(BaseModel):
|
||||||
name: str
|
name: str
|
||||||
@@ -8278,7 +8278,7 @@ import asyncio
|
|||||||
from typing import List
|
from typing import List
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
from crawl4ai import LLMExtractionStrategy
|
||||||
|
|
||||||
class Entity(BaseModel):
|
class Entity(BaseModel):
|
||||||
name: str
|
name: str
|
||||||
@@ -8423,7 +8423,7 @@ Let’s begin with a **simple** schema-based extraction using the `JsonCssExtrac
|
|||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
async def extract_crypto_prices():
|
async def extract_crypto_prices():
|
||||||
# 1. Define a simple extraction schema
|
# 1. Define a simple extraction schema
|
||||||
@@ -8493,7 +8493,7 @@ Below is a short example demonstrating **XPath** extraction plus the **`raw://`*
|
|||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy
|
from crawl4ai import JsonXPathExtractionStrategy
|
||||||
|
|
||||||
async def extract_crypto_prices_xpath():
|
async def extract_crypto_prices_xpath():
|
||||||
# 1. Minimal dummy HTML with some repeating rows
|
# 1. Minimal dummy HTML with some repeating rows
|
||||||
@@ -8694,7 +8694,7 @@ Key Takeaways:
|
|||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
ecommerce_schema = {
|
ecommerce_schema = {
|
||||||
# ... the advanced schema from above ...
|
# ... the advanced schema from above ...
|
||||||
@@ -8804,7 +8804,7 @@ While manually crafting schemas is powerful and precise, Crawl4AI now offers a c
|
|||||||
The schema generator is available as a static method on both `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy`. You can choose between OpenAI's GPT-4 or the open-source Ollama for schema generation:
|
The schema generator is available as a static method on both `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy`. You can choose between OpenAI's GPT-4 or the open-source Ollama for schema generation:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
|
||||||
from crawl4ai import LLMConfig
|
from crawl4ai import LLMConfig
|
||||||
|
|
||||||
# Sample HTML with product information
|
# Sample HTML with product information
|
||||||
|
|||||||
7715
docs/apps/iseeyou/llms-full.txt
Normal file
7715
docs/apps/iseeyou/llms-full.txt
Normal file
File diff suppressed because it is too large
Load Diff
@@ -5,7 +5,7 @@ prices, ratings, and other details using CSS selectors.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
from crawl4ai import AsyncWebCrawler
|
from crawl4ai import AsyncWebCrawler
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ prices, ratings, and other details using CSS selectors.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
import json
|
import json
|
||||||
from playwright.async_api import Page, BrowserContext
|
from playwright.async_api import Page, BrowserContext
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ prices, ratings, and other details using CSS selectors.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ from pathlib import Path
|
|||||||
from typing import List, Dict, Any
|
from typing import List, Dict, Any
|
||||||
|
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
from crawl4ai.script.c4a_compile import C4ACompiler
|
from crawl4ai.script.c4a_compile import C4ACompiler
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ from pathlib import Path
|
|||||||
from typing import List, Dict, Any
|
from typing import List, Dict, Any
|
||||||
|
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
from crawl4ai.script.c4a_compile import C4ACompiler
|
from crawl4ai.script.c4a_compile import C4ACompiler
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ import os
|
|||||||
|
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
from crawl4ai import LLMConfig
|
from crawl4ai import LLMConfig
|
||||||
from crawl4ai.extraction_strategy import (
|
from crawl4ai import (
|
||||||
LLMExtractionStrategy,
|
LLMExtractionStrategy,
|
||||||
JsonCssExtractionStrategy,
|
JsonCssExtractionStrategy,
|
||||||
JsonXPathExtractionStrategy,
|
JsonXPathExtractionStrategy,
|
||||||
|
|||||||
@@ -518,7 +518,7 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"from crawl4ai.extraction_strategy import LLMExtractionStrategy\n",
|
"from crawl4ai import LLMExtractionStrategy\n",
|
||||||
"from pydantic import BaseModel, Field\n",
|
"from pydantic import BaseModel, Field\n",
|
||||||
"import os, json\n",
|
"import os, json\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -594,7 +594,7 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"from crawl4ai.extraction_strategy import CosineStrategy\n",
|
"from crawl4ai import CosineStrategy\n",
|
||||||
"\n",
|
"\n",
|
||||||
"async def cosine_similarity_extraction():\n",
|
"async def cosine_similarity_extraction():\n",
|
||||||
" async with AsyncWebCrawler() as crawler:\n",
|
" async with AsyncWebCrawler() as crawler:\n",
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ from pydantic import BaseModel, Field
|
|||||||
from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
|
||||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||||
from crawl4ai.extraction_strategy import (
|
from crawl4ai import (
|
||||||
JsonCssExtractionStrategy,
|
JsonCssExtractionStrategy,
|
||||||
LLMExtractionStrategy,
|
LLMExtractionStrategy,
|
||||||
)
|
)
|
||||||
@@ -416,7 +416,7 @@ async def crawl_dynamic_content_pages_method_2():
|
|||||||
|
|
||||||
|
|
||||||
async def cosine_similarity_extraction():
|
async def cosine_similarity_extraction():
|
||||||
from crawl4ai.extraction_strategy import CosineStrategy
|
from crawl4ai import CosineStrategy
|
||||||
crawl_config = CrawlerRunConfig(
|
crawl_config = CrawlerRunConfig(
|
||||||
cache_mode=CacheMode.BYPASS,
|
cache_mode=CacheMode.BYPASS,
|
||||||
extraction_strategy=CosineStrategy(
|
extraction_strategy=CosineStrategy(
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ from pydantic import BaseModel, Field
|
|||||||
from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
|
||||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||||
from crawl4ai.extraction_strategy import (
|
from crawl4ai import (
|
||||||
JsonCssExtractionStrategy,
|
JsonCssExtractionStrategy,
|
||||||
LLMExtractionStrategy,
|
LLMExtractionStrategy,
|
||||||
)
|
)
|
||||||
@@ -416,7 +416,7 @@ async def crawl_dynamic_content_pages_method_2():
|
|||||||
|
|
||||||
|
|
||||||
async def cosine_similarity_extraction():
|
async def cosine_similarity_extraction():
|
||||||
from crawl4ai.extraction_strategy import CosineStrategy
|
from crawl4ai import CosineStrategy
|
||||||
crawl_config = CrawlerRunConfig(
|
crawl_config = CrawlerRunConfig(
|
||||||
cache_mode=CacheMode.BYPASS,
|
cache_mode=CacheMode.BYPASS,
|
||||||
extraction_strategy=CosineStrategy(
|
extraction_strategy=CosineStrategy(
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import os
|
|||||||
import json
|
import json
|
||||||
from crawl4ai.web_crawler import WebCrawler
|
from crawl4ai.web_crawler import WebCrawler
|
||||||
from crawl4ai.chunking_strategy import *
|
from crawl4ai.chunking_strategy import *
|
||||||
from crawl4ai.extraction_strategy import *
|
from crawl4ai import *
|
||||||
from crawl4ai.crawler_strategy import *
|
from crawl4ai.crawler_strategy import *
|
||||||
|
|
||||||
url = r"https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot"
|
url = r"https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot"
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ from crawl4ai import RoundRobinProxyStrategy
|
|||||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||||
from crawl4ai import DefaultMarkdownGenerator
|
from crawl4ai import DefaultMarkdownGenerator
|
||||||
from crawl4ai import LLMConfig
|
from crawl4ai import LLMConfig
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
|
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
|
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ Here's an example of crawling GitHub commits across multiple pages while preserv
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.async_configs import CrawlerRunConfig
|
from crawl4ai.async_configs import CrawlerRunConfig
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
from crawl4ai.cache_context import CacheMode
|
from crawl4ai.cache_context import CacheMode
|
||||||
|
|
||||||
async def crawl_dynamic_content():
|
async def crawl_dynamic_content():
|
||||||
|
|||||||
@@ -215,7 +215,7 @@ Below is a snippet combining many parameters:
|
|||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
# Example schema
|
# Example schema
|
||||||
|
|||||||
@@ -217,7 +217,7 @@ Below is an example hooking it all together:
|
|||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
import json
|
import json
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
|
|||||||
@@ -169,7 +169,7 @@ OverlappingWindowChunking(
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
from crawl4ai import LLMExtractionStrategy
|
||||||
from crawl4ai import LLMConfig
|
from crawl4ai import LLMConfig
|
||||||
|
|
||||||
# Define schema
|
# Define schema
|
||||||
@@ -247,7 +247,7 @@ async with AsyncWebCrawler() as crawler:
|
|||||||
### CSS Extraction
|
### CSS Extraction
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
# Define schema
|
# Define schema
|
||||||
schema = {
|
schema = {
|
||||||
|
|||||||
@@ -1701,7 +1701,7 @@ Generated: ${new Date().toISOString()}
|
|||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
# The extraction schema generated from your selections
|
# The extraction schema generated from your selections
|
||||||
EXTRACTION_SCHEMA = ${schemaJson}
|
EXTRACTION_SCHEMA = ${schemaJson}
|
||||||
@@ -1782,7 +1782,7 @@ import asyncio
|
|||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
# HTML snippet of the selected container element
|
# HTML snippet of the selected container element
|
||||||
HTML_SNIPPET = """
|
HTML_SNIPPET = """
|
||||||
|
|||||||
@@ -2437,7 +2437,7 @@ Generated: ${timestamp}
|
|||||||
import asyncio
|
import asyncio
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
# C4A Script commands
|
# C4A Script commands
|
||||||
C4A_SCRIPT = """
|
C4A_SCRIPT = """
|
||||||
|
|||||||
@@ -476,7 +476,7 @@ services:
|
|||||||
```python
|
```python
|
||||||
# Method 1: Create config objects and dump to see expected JSON structure
|
# Method 1: Create config objects and dump to see expected JSON structure
|
||||||
from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, CacheMode
|
from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, CacheMode
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy
|
||||||
import json
|
import json
|
||||||
|
|
||||||
# Create browser config and see JSON structure
|
# Create browser config and see JSON structure
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ import json
|
|||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from typing import List
|
from typing import List
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
from crawl4ai import LLMExtractionStrategy
|
||||||
|
|
||||||
class SentimentAnalysis(BaseModel):
|
class SentimentAnalysis(BaseModel):
|
||||||
"""Use LLM when you need semantic understanding"""
|
"""Use LLM when you need semantic understanding"""
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ import json
|
|||||||
import asyncio
|
import asyncio
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
async def smart_extraction_workflow():
|
async def smart_extraction_workflow():
|
||||||
"""
|
"""
|
||||||
@@ -176,7 +176,7 @@ social_schema = JsonCssExtractionStrategy.generate_schema(
|
|||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
# Manual schema for consistent product pages
|
# Manual schema for consistent product pages
|
||||||
simple_schema = {
|
simple_schema = {
|
||||||
@@ -342,7 +342,7 @@ asyncio.run(extract_complex_ecommerce())
|
|||||||
### XPath Alternative (When CSS Isn't Enough)
|
### XPath Alternative (When CSS Isn't Enough)
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy
|
from crawl4ai import JsonXPathExtractionStrategy
|
||||||
|
|
||||||
# XPath for more complex selections
|
# XPath for more complex selections
|
||||||
xpath_schema = {
|
xpath_schema = {
|
||||||
@@ -387,7 +387,7 @@ strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True)
|
|||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
from crawl4ai.extraction_strategy import RegexExtractionStrategy
|
from crawl4ai import RegexExtractionStrategy
|
||||||
|
|
||||||
async def extract_common_patterns():
|
async def extract_common_patterns():
|
||||||
# Use built-in patterns for common data types
|
# Use built-in patterns for common data types
|
||||||
|
|||||||
@@ -1835,7 +1835,7 @@ import json
|
|||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from typing import List
|
from typing import List
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
from crawl4ai import LLMExtractionStrategy
|
||||||
|
|
||||||
class SentimentAnalysis(BaseModel):
|
class SentimentAnalysis(BaseModel):
|
||||||
"""Use LLM when you need semantic understanding"""
|
"""Use LLM when you need semantic understanding"""
|
||||||
@@ -2743,7 +2743,7 @@ import json
|
|||||||
import asyncio
|
import asyncio
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
async def smart_extraction_workflow():
|
async def smart_extraction_workflow():
|
||||||
"""
|
"""
|
||||||
@@ -2880,7 +2880,7 @@ social_schema = JsonCssExtractionStrategy.generate_schema(
|
|||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
# Manual schema for consistent product pages
|
# Manual schema for consistent product pages
|
||||||
simple_schema = {
|
simple_schema = {
|
||||||
@@ -3046,7 +3046,7 @@ asyncio.run(extract_complex_ecommerce())
|
|||||||
### XPath Alternative (When CSS Isn't Enough)
|
### XPath Alternative (When CSS Isn't Enough)
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy
|
from crawl4ai import JsonXPathExtractionStrategy
|
||||||
|
|
||||||
# XPath for more complex selections
|
# XPath for more complex selections
|
||||||
xpath_schema = {
|
xpath_schema = {
|
||||||
@@ -3091,7 +3091,7 @@ strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True)
|
|||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
from crawl4ai.extraction_strategy import RegexExtractionStrategy
|
from crawl4ai import RegexExtractionStrategy
|
||||||
|
|
||||||
async def extract_common_patterns():
|
async def extract_common_patterns():
|
||||||
# Use built-in patterns for common data types
|
# Use built-in patterns for common data types
|
||||||
@@ -4711,7 +4711,7 @@ services:
|
|||||||
```python
|
```python
|
||||||
# Method 1: Create config objects and dump to see expected JSON structure
|
# Method 1: Create config objects and dump to see expected JSON structure
|
||||||
from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, CacheMode
|
from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, CacheMode
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy
|
||||||
import json
|
import json
|
||||||
|
|
||||||
# Create browser config and see JSON structure
|
# Create browser config and see JSON structure
|
||||||
@@ -5792,7 +5792,7 @@ import json
|
|||||||
import asyncio
|
import asyncio
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
async def smart_extraction_workflow():
|
async def smart_extraction_workflow():
|
||||||
"""
|
"""
|
||||||
@@ -5929,7 +5929,7 @@ social_schema = JsonCssExtractionStrategy.generate_schema(
|
|||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
# Manual schema for consistent product pages
|
# Manual schema for consistent product pages
|
||||||
simple_schema = {
|
simple_schema = {
|
||||||
@@ -6095,7 +6095,7 @@ asyncio.run(extract_complex_ecommerce())
|
|||||||
### XPath Alternative (When CSS Isn't Enough)
|
### XPath Alternative (When CSS Isn't Enough)
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy
|
from crawl4ai import JsonXPathExtractionStrategy
|
||||||
|
|
||||||
# XPath for more complex selections
|
# XPath for more complex selections
|
||||||
xpath_schema = {
|
xpath_schema = {
|
||||||
@@ -6140,7 +6140,7 @@ strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True)
|
|||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
from crawl4ai.extraction_strategy import RegexExtractionStrategy
|
from crawl4ai import RegexExtractionStrategy
|
||||||
|
|
||||||
async def extract_common_patterns():
|
async def extract_common_patterns():
|
||||||
# Use built-in patterns for common data types
|
# Use built-in patterns for common data types
|
||||||
|
|||||||
@@ -1835,7 +1835,7 @@ import json
|
|||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from typing import List
|
from typing import List
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
from crawl4ai import LLMExtractionStrategy
|
||||||
|
|
||||||
class SentimentAnalysis(BaseModel):
|
class SentimentAnalysis(BaseModel):
|
||||||
"""Use LLM when you need semantic understanding"""
|
"""Use LLM when you need semantic understanding"""
|
||||||
@@ -2743,7 +2743,7 @@ import json
|
|||||||
import asyncio
|
import asyncio
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
async def smart_extraction_workflow():
|
async def smart_extraction_workflow():
|
||||||
"""
|
"""
|
||||||
@@ -2880,7 +2880,7 @@ social_schema = JsonCssExtractionStrategy.generate_schema(
|
|||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
# Manual schema for consistent product pages
|
# Manual schema for consistent product pages
|
||||||
simple_schema = {
|
simple_schema = {
|
||||||
@@ -3046,7 +3046,7 @@ asyncio.run(extract_complex_ecommerce())
|
|||||||
### XPath Alternative (When CSS Isn't Enough)
|
### XPath Alternative (When CSS Isn't Enough)
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy
|
from crawl4ai import JsonXPathExtractionStrategy
|
||||||
|
|
||||||
# XPath for more complex selections
|
# XPath for more complex selections
|
||||||
xpath_schema = {
|
xpath_schema = {
|
||||||
@@ -3091,7 +3091,7 @@ strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True)
|
|||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
from crawl4ai.extraction_strategy import RegexExtractionStrategy
|
from crawl4ai import RegexExtractionStrategy
|
||||||
|
|
||||||
async def extract_common_patterns():
|
async def extract_common_patterns():
|
||||||
# Use built-in patterns for common data types
|
# Use built-in patterns for common data types
|
||||||
@@ -4711,7 +4711,7 @@ services:
|
|||||||
```python
|
```python
|
||||||
# Method 1: Create config objects and dump to see expected JSON structure
|
# Method 1: Create config objects and dump to see expected JSON structure
|
||||||
from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, CacheMode
|
from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, CacheMode
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy
|
||||||
import json
|
import json
|
||||||
|
|
||||||
# Create browser config and see JSON structure
|
# Create browser config and see JSON structure
|
||||||
@@ -5792,7 +5792,7 @@ import json
|
|||||||
import asyncio
|
import asyncio
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
async def smart_extraction_workflow():
|
async def smart_extraction_workflow():
|
||||||
"""
|
"""
|
||||||
@@ -5929,7 +5929,7 @@ social_schema = JsonCssExtractionStrategy.generate_schema(
|
|||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
# Manual schema for consistent product pages
|
# Manual schema for consistent product pages
|
||||||
simple_schema = {
|
simple_schema = {
|
||||||
@@ -6095,7 +6095,7 @@ asyncio.run(extract_complex_ecommerce())
|
|||||||
### XPath Alternative (When CSS Isn't Enough)
|
### XPath Alternative (When CSS Isn't Enough)
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy
|
from crawl4ai import JsonXPathExtractionStrategy
|
||||||
|
|
||||||
# XPath for more complex selections
|
# XPath for more complex selections
|
||||||
xpath_schema = {
|
xpath_schema = {
|
||||||
@@ -6140,7 +6140,7 @@ strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True)
|
|||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
from crawl4ai.extraction_strategy import RegexExtractionStrategy
|
from crawl4ai import RegexExtractionStrategy
|
||||||
|
|
||||||
async def extract_common_patterns():
|
async def extract_common_patterns():
|
||||||
# Use built-in patterns for common data types
|
# Use built-in patterns for common data types
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -334,7 +334,7 @@ asyncio.run(main())
|
|||||||
schemas.
|
schemas.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
from crawl4ai import LLMConfig
|
from crawl4ai import LLMConfig
|
||||||
|
|
||||||
llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
|
llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
|
||||||
@@ -402,7 +402,7 @@ print(schema)
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import LLMConfig
|
from crawl4ai import LLMConfig
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
from crawl4ai import LLMExtractionStrategy
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
|
|
||||||
# Example of using LLMConfig with LLMExtractionStrategy
|
# Example of using LLMConfig with LLMExtractionStrategy
|
||||||
|
|||||||
@@ -274,7 +274,7 @@ In a typical scenario, you define **one** `BrowserConfig` for your crawler sessi
|
|||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
# 1) Browser config: headless, bigger viewport, no proxy
|
# 1) Browser config: headless, bigger viewport, no proxy
|
||||||
|
|||||||
@@ -191,7 +191,7 @@ You can combine content selection with a more advanced extraction strategy. For
|
|||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
# Minimal schema for repeated items
|
# Minimal schema for repeated items
|
||||||
@@ -243,7 +243,7 @@ import asyncio
|
|||||||
import json
|
import json
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
from crawl4ai import LLMExtractionStrategy
|
||||||
|
|
||||||
class ArticleData(BaseModel):
|
class ArticleData(BaseModel):
|
||||||
headline: str
|
headline: str
|
||||||
@@ -288,7 +288,7 @@ Below is a short function that unifies **CSS selection**, **exclusion** logic, a
|
|||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
async def extract_main_articles(url: str):
|
async def extract_main_articles(url: str):
|
||||||
schema = {
|
schema = {
|
||||||
|
|||||||
@@ -138,7 +138,7 @@ If you run a JSON-based extraction strategy (CSS, XPath, LLM, etc.), the structu
|
|||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
schema = {
|
schema = {
|
||||||
|
|||||||
@@ -296,7 +296,7 @@ if __name__ == "__main__":
|
|||||||
Once dynamic content is loaded, you can attach an **`extraction_strategy`** (like `JsonCssExtractionStrategy` or `LLMExtractionStrategy`). For example:
|
Once dynamic content is loaded, you can attach an **`extraction_strategy`** (like `JsonCssExtractionStrategy` or `LLMExtractionStrategy`). For example:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
schema = {
|
schema = {
|
||||||
"name": "Commits",
|
"name": "Commits",
|
||||||
|
|||||||
@@ -127,7 +127,7 @@ Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. B
|
|||||||
> **New!** Crawl4AI now provides a powerful utility to automatically generate extraction schemas using LLM. This is a one-time cost that gives you a reusable schema for fast, LLM-free extractions:
|
> **New!** Crawl4AI now provides a powerful utility to automatically generate extraction schemas using LLM. This is a one-time cost that gives you a reusable schema for fast, LLM-free extractions:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
from crawl4ai import LLMConfig
|
from crawl4ai import LLMConfig
|
||||||
|
|
||||||
# Generate a schema (one-time cost)
|
# Generate a schema (one-time cost)
|
||||||
@@ -157,7 +157,7 @@ Here's a basic extraction example:
|
|||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
schema = {
|
schema = {
|
||||||
@@ -212,7 +212,7 @@ import json
|
|||||||
import asyncio
|
import asyncio
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
from crawl4ai import LLMExtractionStrategy
|
||||||
|
|
||||||
class OpenAIModelFee(BaseModel):
|
class OpenAIModelFee(BaseModel):
|
||||||
model_name: str = Field(..., description="Name of the OpenAI model.")
|
model_name: str = Field(..., description="Name of the OpenAI model.")
|
||||||
@@ -328,7 +328,7 @@ Some sites require multiple “page clicks” or dynamic JavaScript updates. Bel
|
|||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
async def extract_structured_data_using_css_extractor():
|
async def extract_structured_data_using_css_extractor():
|
||||||
print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
|
print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ The Cosine Strategy:
|
|||||||
## Basic Usage
|
## Basic Usage
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.extraction_strategy import CosineStrategy
|
from crawl4ai import CosineStrategy
|
||||||
|
|
||||||
strategy = CosineStrategy(
|
strategy = CosineStrategy(
|
||||||
semantic_filter="product reviews", # Target content type
|
semantic_filter="product reviews", # Target content type
|
||||||
|
|||||||
@@ -102,7 +102,7 @@ import json
|
|||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from typing import List
|
from typing import List
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
from crawl4ai import LLMExtractionStrategy
|
||||||
|
|
||||||
class Product(BaseModel):
|
class Product(BaseModel):
|
||||||
name: str
|
name: str
|
||||||
@@ -219,7 +219,7 @@ import asyncio
|
|||||||
from typing import List
|
from typing import List
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
from crawl4ai import LLMExtractionStrategy
|
||||||
|
|
||||||
class Entity(BaseModel):
|
class Entity(BaseModel):
|
||||||
name: str
|
name: str
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ Let's begin with a **simple** schema-based extraction using the `JsonCssExtracti
|
|||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
async def extract_crypto_prices():
|
async def extract_crypto_prices():
|
||||||
# 1. Define a simple extraction schema
|
# 1. Define a simple extraction schema
|
||||||
@@ -108,7 +108,7 @@ Below is a short example demonstrating **XPath** extraction plus the **`raw://`*
|
|||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy
|
from crawl4ai import JsonXPathExtractionStrategy
|
||||||
|
|
||||||
async def extract_crypto_prices_xpath():
|
async def extract_crypto_prices_xpath():
|
||||||
# 1. Minimal dummy HTML with some repeating rows
|
# 1. Minimal dummy HTML with some repeating rows
|
||||||
@@ -309,7 +309,7 @@ Key Takeaways:
|
|||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
ecommerce_schema = {
|
ecommerce_schema = {
|
||||||
# ... the advanced schema from above ...
|
# ... the advanced schema from above ...
|
||||||
@@ -649,7 +649,7 @@ While manually crafting schemas is powerful and precise, Crawl4AI now offers a c
|
|||||||
The schema generator is available as a static method on both `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy`. You can choose between OpenAI's GPT-4 or the open-source Ollama for schema generation:
|
The schema generator is available as a static method on both `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy`. You can choose between OpenAI's GPT-4 or the open-source Ollama for schema generation:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
|
||||||
from crawl4ai import LLMConfig
|
from crawl4ai import LLMConfig
|
||||||
|
|
||||||
# Sample HTML with product information
|
# Sample HTML with product information
|
||||||
|
|||||||
@@ -149,7 +149,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from crawl4ai.extraction_strategy import LLMExtractionStrategy\n",
|
"from crawl4ai import LLMExtractionStrategy\n",
|
||||||
"from pydantic import BaseModel\n",
|
"from pydantic import BaseModel\n",
|
||||||
"import json, os\n",
|
"import json, os\n",
|
||||||
"from typing import List\n",
|
"from typing import List\n",
|
||||||
|
|||||||
@@ -44,6 +44,7 @@ dependencies = [
|
|||||||
"aiohttp>=3.11.11",
|
"aiohttp>=3.11.11",
|
||||||
"brotli>=1.1.0",
|
"brotli>=1.1.0",
|
||||||
"humanize>=4.10.0",
|
"humanize>=4.10.0",
|
||||||
|
"lark>=1.2.2"
|
||||||
]
|
]
|
||||||
classifiers = [
|
classifiers = [
|
||||||
"Development Status :: 4 - Beta",
|
"Development Status :: 4 - Beta",
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ import asyncio
|
|||||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
from crawl4ai.chunking_strategy import RegexChunking
|
from crawl4ai.chunking_strategy import RegexChunking
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ sys.path.append(parent_dir)
|
|||||||
from crawl4ai import LLMConfig
|
from crawl4ai import LLMConfig
|
||||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||||
from crawl4ai.chunking_strategy import RegexChunking
|
from crawl4ai.chunking_strategy import RegexChunking
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
from crawl4ai import LLMExtractionStrategy
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file
|
|||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
|
||||||
from crawl4ai.utils import preprocess_html_for_schema, JsonXPathExtractionStrategy
|
from crawl4ai.utils import preprocess_html_for_schema, JsonXPathExtractionStrategy
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ from crawl4ai.chunking_strategy import (
|
|||||||
FixedLengthWordChunking,
|
FixedLengthWordChunking,
|
||||||
SlidingWindowChunking,
|
SlidingWindowChunking,
|
||||||
)
|
)
|
||||||
from crawl4ai.extraction_strategy import (
|
from crawl4ai import (
|
||||||
CosineStrategy,
|
CosineStrategy,
|
||||||
LLMExtractionStrategy,
|
LLMExtractionStrategy,
|
||||||
TopicExtractionStrategy,
|
TopicExtractionStrategy,
|
||||||
|
|||||||
Reference in New Issue
Block a user