Compare commits
22 Commits
feature/co
...
0.3.742
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
de43505ae4 | ||
|
|
d7c5b900b8 | ||
|
|
edad7b6a74 | ||
|
|
829a1f7992 | ||
|
|
d729aa7d5e | ||
|
|
0d0cef3438 | ||
|
|
d7a112fefe | ||
|
|
a5decaa7cf | ||
|
|
8dea3f470f | ||
|
|
e02935dc5b | ||
|
|
24ad2fe2dd | ||
|
|
571dda6549 | ||
|
|
006bee4a5a | ||
|
|
dbb751c8f0 | ||
|
|
3439f7886d | ||
|
|
d418a04602 | ||
|
|
b654c49e55 | ||
|
|
fbcff85ecb | ||
|
|
788c67c29a | ||
|
|
2f19d38693 | ||
|
|
3aae30ed2a | ||
|
|
593c7ad307 |
5
.gitignore
vendored
5
.gitignore
vendored
@@ -211,5 +211,6 @@ git_issues.md
|
||||
.docs/
|
||||
.issues/
|
||||
.gitboss/
|
||||
|
||||
manage-collab.sh
|
||||
todo_executor.md
|
||||
protect-all-except-feature.sh
|
||||
manage-collab.sh
|
||||
|
||||
31
README.md
31
README.md
@@ -13,15 +13,19 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
|
||||
|
||||
## New in 0.3.74 ✨
|
||||
|
||||
- 🚀 **Blazing Fast Scraping:** The scraping process is now significantly faster, often completing in under 100 milliseconds (excluding web fetch time)!
|
||||
- 📥 **Download Manager:** Integrated file crawling and downloading capabilities, with full control over file management and tracking within the `CrawlResult` object.
|
||||
- 🔎 **Markdown Filter:** Enhanced content extraction using BM25 algorithm to create cleaner markdown with only relevant webpage content.
|
||||
- 🗂️ **Local & Raw HTML:** Crawl local files (`file://`) and raw HTML strings (`raw:`) directly.
|
||||
- 🤖 **Browser Control:** Use your own browser setup for crawling, with persistent contexts and stealth integration to bypass anti-bot measures.
|
||||
- ☁️ **API & Cache Boost:** CORS support, static file serving, and a new filesystem-based cache for blazing-fast performance. Fine-tune caching with the `CacheMode` enum (ENABLED, DISABLED, READ_ONLY, WRITE_ONLY, BYPASS) and the `always_bypass_cache` parameter.
|
||||
- 🐳 **API Gateway:** Run Crawl4AI as a local or cloud API service, enabling cross-platform usage through a containerized server with secure token authentication via `CRAWL4AI_API_TOKEN`.
|
||||
- 🛠️ **Database Improvements:** Enhanced database system for handling larger content sets with improved caching and faster performance.
|
||||
- 🐛 **Squashed Bugs:** Fixed browser context issues in Docker, memory leaks, enhanced error handling, and improved HTML parsing.
|
||||
- 🚀 **Blazing Fast Scraping**: Significantly improved scraping speed.
|
||||
- 📥 **Download Manager**: Integrated file crawling, downloading, and tracking within `CrawlResult`.
|
||||
- 📝 **Markdown Strategy**: Flexible system for custom markdown generation and formats.
|
||||
- 🔗 **LLM-Friendly Citations**: Auto-converts links to numbered citations with reference lists.
|
||||
- 🔎 **Markdown Filter**: BM25-based content extraction for cleaner, relevant markdown.
|
||||
- 🖼️ **Image Extraction**: Supports `srcset`, `picture`, and responsive image formats.
|
||||
- 🗂️ **Local/Raw HTML**: Crawl `file://` paths and raw HTML (`raw:`) directly.
|
||||
- 🤖 **Browser Control**: Custom browser setups with stealth integration to bypass bots.
|
||||
- ☁️ **API & Cache Boost**: CORS, static serving, and enhanced filesystem-based caching.
|
||||
- 🐳 **API Gateway**: Run as an API service with secure token authentication.
|
||||
- 🛠️ **Database Upgrades**: Optimized for larger content sets with faster caching.
|
||||
- 🐛 **Bug Fixes**: Resolved browser context issues, memory leaks, and improved error handling.
|
||||
|
||||
|
||||
## Try it Now!
|
||||
|
||||
@@ -138,6 +142,9 @@ docker pull unclecode/crawl4ai:gpu # GPU-enabled version
|
||||
# Run the container
|
||||
docker run -p 11235:11235 unclecode/crawl4ai:basic # Replace 'basic' with your chosen version
|
||||
|
||||
# In case you want to set platform to arm64
|
||||
docker run --platform linux/arm64 -p 11235:11235 unclecode/crawl4ai:basic
|
||||
|
||||
# In case to allocate more shared memory for the container
|
||||
docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic
|
||||
```
|
||||
@@ -154,6 +161,12 @@ docker build -t crawl4ai:local \
|
||||
--build-arg INSTALL_TYPE=basic \ # Options: basic, all
|
||||
.
|
||||
|
||||
# In case you want to set platform to arm64
|
||||
docker build -t crawl4ai:local \
|
||||
--build-arg INSTALL_TYPE=basic \ # Options: basic, all
|
||||
--platform linux/arm64 \
|
||||
.
|
||||
|
||||
# Run your local build
|
||||
docker run -p 11235:11235 crawl4ai:local
|
||||
```
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# __init__.py
|
||||
|
||||
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
||||
|
||||
from .models import CrawlResult
|
||||
from .__version__ import __version__
|
||||
# __version__ = "0.3.73"
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
# crawl4ai/_version.py
|
||||
__version__ = "0.3.74"
|
||||
__version__ = "0.3.742"
|
||||
@@ -229,6 +229,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
self.headless = kwargs.get("headless", True)
|
||||
self.browser_type = kwargs.get("browser_type", "chromium")
|
||||
self.headers = kwargs.get("headers", {})
|
||||
self.cookies = kwargs.get("cookies", [])
|
||||
self.sessions = {}
|
||||
self.session_ttl = 1800
|
||||
self.js_code = js_code
|
||||
@@ -295,6 +296,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
# Set up the default context
|
||||
if self.default_context:
|
||||
await self.default_context.set_extra_http_headers(self.headers)
|
||||
if self.cookies:
|
||||
await self.default_context.add_cookies(self.cookies)
|
||||
if self.accept_downloads:
|
||||
await self.default_context.set_default_timeout(60000)
|
||||
await self.default_context.set_default_navigation_timeout(60000)
|
||||
@@ -669,6 +672,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
# downloads_path=self.downloads_path if self.accept_downloads else None
|
||||
)
|
||||
await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}])
|
||||
if self.cookies:
|
||||
await context.add_cookies(self.cookies)
|
||||
await context.set_extra_http_headers(self.headers)
|
||||
page = await context.new_page()
|
||||
self.sessions[session_id] = (context, page, time.time())
|
||||
@@ -684,6 +689,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
proxy={"server": self.proxy} if self.proxy else None,
|
||||
accept_downloads=self.accept_downloads,
|
||||
)
|
||||
if self.cookies:
|
||||
await context.add_cookies(self.cookies)
|
||||
await context.set_extra_http_headers(self.headers)
|
||||
|
||||
if kwargs.get("override_navigator", False) or kwargs.get("simulate_user", False) or kwargs.get("magic", False):
|
||||
@@ -828,7 +835,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
for js in js_code:
|
||||
await page.evaluate(js)
|
||||
|
||||
await page.wait_for_load_state('networkidle')
|
||||
# await page.wait_for_timeout(100)
|
||||
|
||||
# Check for on execution event
|
||||
await self.execute_hook('on_execution_started', page)
|
||||
|
||||
@@ -846,6 +854,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000))
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Wait condition failed: {str(e)}")
|
||||
|
||||
# if not wait_for and js_code:
|
||||
# await page.wait_for_load_state('networkidle', timeout=5000)
|
||||
|
||||
# Update image dimensions
|
||||
update_image_dimensions_js = """
|
||||
|
||||
@@ -1,285 +0,0 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
import aiosqlite
|
||||
import asyncio
|
||||
from typing import Optional, Tuple, Dict
|
||||
from contextlib import asynccontextmanager
|
||||
import logging
|
||||
import json # Added for serialization/deserialization
|
||||
from .utils import ensure_content_dirs, generate_content_hash
|
||||
import xxhash
|
||||
import aiofiles
|
||||
# Set up logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DB_PATH = os.path.join(Path.home(), ".crawl4ai")
|
||||
os.makedirs(DB_PATH, exist_ok=True)
|
||||
DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
|
||||
|
||||
class AsyncDatabaseManager:
|
||||
def __init__(self, pool_size: int = 10, max_retries: int = 3):
|
||||
self.db_path = DB_PATH
|
||||
self.content_paths = ensure_content_dirs(os.path.dirname(DB_PATH))
|
||||
self.pool_size = pool_size
|
||||
self.max_retries = max_retries
|
||||
self.connection_pool: Dict[int, aiosqlite.Connection] = {}
|
||||
self.pool_lock = asyncio.Lock()
|
||||
self.connection_semaphore = asyncio.Semaphore(pool_size)
|
||||
|
||||
async def initialize(self):
|
||||
"""Initialize the database and connection pool"""
|
||||
await self.ainit_db()
|
||||
|
||||
async def cleanup(self):
|
||||
"""Cleanup connections when shutting down"""
|
||||
async with self.pool_lock:
|
||||
for conn in self.connection_pool.values():
|
||||
await conn.close()
|
||||
self.connection_pool.clear()
|
||||
|
||||
@asynccontextmanager
|
||||
async def get_connection(self):
|
||||
"""Connection pool manager"""
|
||||
async with self.connection_semaphore:
|
||||
task_id = id(asyncio.current_task())
|
||||
try:
|
||||
async with self.pool_lock:
|
||||
if task_id not in self.connection_pool:
|
||||
conn = await aiosqlite.connect(
|
||||
self.db_path,
|
||||
timeout=30.0
|
||||
)
|
||||
await conn.execute('PRAGMA journal_mode = WAL')
|
||||
await conn.execute('PRAGMA busy_timeout = 5000')
|
||||
self.connection_pool[task_id] = conn
|
||||
|
||||
yield self.connection_pool[task_id]
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Connection error: {e}")
|
||||
raise
|
||||
finally:
|
||||
async with self.pool_lock:
|
||||
if task_id in self.connection_pool:
|
||||
await self.connection_pool[task_id].close()
|
||||
del self.connection_pool[task_id]
|
||||
|
||||
async def execute_with_retry(self, operation, *args):
|
||||
"""Execute database operations with retry logic"""
|
||||
for attempt in range(self.max_retries):
|
||||
try:
|
||||
async with self.get_connection() as db:
|
||||
result = await operation(db, *args)
|
||||
await db.commit()
|
||||
return result
|
||||
except Exception as e:
|
||||
if attempt == self.max_retries - 1:
|
||||
logger.error(f"Operation failed after {self.max_retries} attempts: {e}")
|
||||
raise
|
||||
await asyncio.sleep(1 * (attempt + 1)) # Exponential backoff
|
||||
|
||||
async def ainit_db(self):
|
||||
"""Initialize database schema"""
|
||||
async def _init(db):
|
||||
await db.execute('''
|
||||
CREATE TABLE IF NOT EXISTS crawled_data (
|
||||
url TEXT PRIMARY KEY,
|
||||
html TEXT,
|
||||
cleaned_html TEXT,
|
||||
markdown TEXT,
|
||||
extracted_content TEXT,
|
||||
success BOOLEAN,
|
||||
media TEXT DEFAULT "{}",
|
||||
links TEXT DEFAULT "{}",
|
||||
metadata TEXT DEFAULT "{}",
|
||||
screenshot TEXT DEFAULT "",
|
||||
response_headers TEXT DEFAULT "{}",
|
||||
downloaded_files TEXT DEFAULT "{}" -- New column added
|
||||
)
|
||||
''')
|
||||
|
||||
await self.execute_with_retry(_init)
|
||||
await self.update_db_schema()
|
||||
|
||||
async def update_db_schema(self):
|
||||
"""Update database schema if needed"""
|
||||
async def _check_columns(db):
|
||||
cursor = await db.execute("PRAGMA table_info(crawled_data)")
|
||||
columns = await cursor.fetchall()
|
||||
return [column[1] for column in columns]
|
||||
|
||||
column_names = await self.execute_with_retry(_check_columns)
|
||||
|
||||
# List of new columns to add
|
||||
new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files']
|
||||
|
||||
for column in new_columns:
|
||||
if column not in column_names:
|
||||
await self.aalter_db_add_column(column)
|
||||
|
||||
async def aalter_db_add_column(self, new_column: str):
|
||||
"""Add new column to the database"""
|
||||
async def _alter(db):
|
||||
if new_column == 'response_headers':
|
||||
await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"')
|
||||
else:
|
||||
await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
|
||||
logger.info(f"Added column '{new_column}' to the database.")
|
||||
|
||||
await self.execute_with_retry(_alter)
|
||||
|
||||
|
||||
async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, bool, str, str, str, str]]:
|
||||
"""Retrieve cached URL data"""
|
||||
async def _get(db):
|
||||
async with db.execute(
|
||||
'''
|
||||
SELECT url, html, cleaned_html, markdown,
|
||||
extracted_content, success, media, links,
|
||||
metadata, screenshot, response_headers,
|
||||
downloaded_files
|
||||
FROM crawled_data WHERE url = ?
|
||||
''',
|
||||
(url,)
|
||||
) as cursor:
|
||||
row = await cursor.fetchone()
|
||||
if row:
|
||||
# Load content from files using stored hashes
|
||||
html = await self._load_content(row[1], 'html') if row[1] else ""
|
||||
cleaned = await self._load_content(row[2], 'cleaned') if row[2] else ""
|
||||
markdown = await self._load_content(row[3], 'markdown') if row[3] else ""
|
||||
extracted = await self._load_content(row[4], 'extracted') if row[4] else ""
|
||||
screenshot = await self._load_content(row[9], 'screenshots') if row[9] else ""
|
||||
|
||||
return (
|
||||
row[0], # url
|
||||
html or "", # Return empty string if file not found
|
||||
cleaned or "",
|
||||
markdown or "",
|
||||
extracted or "",
|
||||
row[5], # success
|
||||
json.loads(row[6] or '{}'), # media
|
||||
json.loads(row[7] or '{}'), # links
|
||||
json.loads(row[8] or '{}'), # metadata
|
||||
screenshot or "",
|
||||
json.loads(row[10] or '{}'), # response_headers
|
||||
json.loads(row[11] or '[]') # downloaded_files
|
||||
)
|
||||
return None
|
||||
|
||||
try:
|
||||
return await self.execute_with_retry(_get)
|
||||
except Exception as e:
|
||||
logger.error(f"Error retrieving cached URL: {e}")
|
||||
return None
|
||||
|
||||
async def acache_url(self, url: str, html: str, cleaned_html: str,
|
||||
markdown: str, extracted_content: str, success: bool,
|
||||
media: str = "{}", links: str = "{}",
|
||||
metadata: str = "{}", screenshot: str = "",
|
||||
response_headers: str = "{}", downloaded_files: str = "[]"):
|
||||
"""Cache URL data with content stored in filesystem"""
|
||||
|
||||
# Store content files and get hashes
|
||||
html_hash = await self._store_content(html, 'html')
|
||||
cleaned_hash = await self._store_content(cleaned_html, 'cleaned')
|
||||
markdown_hash = await self._store_content(markdown, 'markdown')
|
||||
extracted_hash = await self._store_content(extracted_content, 'extracted')
|
||||
screenshot_hash = await self._store_content(screenshot, 'screenshots')
|
||||
|
||||
async def _cache(db):
|
||||
await db.execute('''
|
||||
INSERT INTO crawled_data (
|
||||
url, html, cleaned_html, markdown,
|
||||
extracted_content, success, media, links, metadata,
|
||||
screenshot, response_headers, downloaded_files
|
||||
)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(url) DO UPDATE SET
|
||||
html = excluded.html,
|
||||
cleaned_html = excluded.cleaned_html,
|
||||
markdown = excluded.markdown,
|
||||
extracted_content = excluded.extracted_content,
|
||||
success = excluded.success,
|
||||
media = excluded.media,
|
||||
links = excluded.links,
|
||||
metadata = excluded.metadata,
|
||||
screenshot = excluded.screenshot,
|
||||
response_headers = excluded.response_headers,
|
||||
downloaded_files = excluded.downloaded_files
|
||||
''', (url, html_hash, cleaned_hash, markdown_hash, extracted_hash,
|
||||
success, media, links, metadata, screenshot_hash,
|
||||
response_headers, downloaded_files))
|
||||
|
||||
try:
|
||||
await self.execute_with_retry(_cache)
|
||||
except Exception as e:
|
||||
logger.error(f"Error caching URL: {e}")
|
||||
|
||||
|
||||
|
||||
async def aget_total_count(self) -> int:
|
||||
"""Get total number of cached URLs"""
|
||||
async def _count(db):
|
||||
async with db.execute('SELECT COUNT(*) FROM crawled_data') as cursor:
|
||||
result = await cursor.fetchone()
|
||||
return result[0] if result else 0
|
||||
|
||||
try:
|
||||
return await self.execute_with_retry(_count)
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting total count: {e}")
|
||||
return 0
|
||||
|
||||
async def aclear_db(self):
|
||||
"""Clear all data from the database"""
|
||||
async def _clear(db):
|
||||
await db.execute('DELETE FROM crawled_data')
|
||||
|
||||
try:
|
||||
await self.execute_with_retry(_clear)
|
||||
except Exception as e:
|
||||
logger.error(f"Error clearing database: {e}")
|
||||
|
||||
async def aflush_db(self):
|
||||
"""Drop the entire table"""
|
||||
async def _flush(db):
|
||||
await db.execute('DROP TABLE IF EXISTS crawled_data')
|
||||
|
||||
try:
|
||||
await self.execute_with_retry(_flush)
|
||||
except Exception as e:
|
||||
logger.error(f"Error flushing database: {e}")
|
||||
|
||||
|
||||
async def _store_content(self, content: str, content_type: str) -> str:
|
||||
"""Store content in filesystem and return hash"""
|
||||
if not content:
|
||||
return ""
|
||||
|
||||
content_hash = generate_content_hash(content)
|
||||
file_path = os.path.join(self.content_paths[content_type], content_hash)
|
||||
|
||||
# Only write if file doesn't exist
|
||||
if not os.path.exists(file_path):
|
||||
async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
|
||||
await f.write(content)
|
||||
|
||||
return content_hash
|
||||
|
||||
async def _load_content(self, content_hash: str, content_type: str) -> Optional[str]:
|
||||
"""Load content from filesystem by hash"""
|
||||
if not content_hash:
|
||||
return None
|
||||
|
||||
file_path = os.path.join(self.content_paths[content_type], content_hash)
|
||||
try:
|
||||
async with aiofiles.open(file_path, 'r', encoding='utf-8') as f:
|
||||
return await f.read()
|
||||
except:
|
||||
logger.error(f"Failed to load content: {file_path}")
|
||||
return None
|
||||
|
||||
# Create a singleton instance
|
||||
async_db_manager = AsyncDatabaseManager()
|
||||
@@ -1,344 +0,0 @@
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
import json
|
||||
import asyncio
|
||||
from .models import CrawlResult
|
||||
from .async_database import async_db_manager
|
||||
from .chunking_strategy import *
|
||||
from .extraction_strategy import *
|
||||
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
|
||||
from .content_scrapping_strategy import WebScrapingStrategy
|
||||
from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
|
||||
from .utils import (
|
||||
sanitize_input_encode,
|
||||
InvalidCSSSelectorError,
|
||||
format_html
|
||||
)
|
||||
from .__version__ import __version__ as crawl4ai_version
|
||||
|
||||
class AsyncWebCrawler:
|
||||
def __init__(
|
||||
self,
|
||||
crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
|
||||
always_by_pass_cache: bool = False,
|
||||
base_directory: str = str(Path.home()),
|
||||
**kwargs,
|
||||
):
|
||||
self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
|
||||
**kwargs
|
||||
)
|
||||
self.always_by_pass_cache = always_by_pass_cache
|
||||
# self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
|
||||
self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
|
||||
os.makedirs(self.crawl4ai_folder, exist_ok=True)
|
||||
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
|
||||
self.ready = False
|
||||
self.verbose = kwargs.get("verbose", False)
|
||||
|
||||
async def __aenter__(self):
|
||||
await self.crawler_strategy.__aenter__()
|
||||
await self.awarmup()
|
||||
return self
|
||||
|
||||
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
||||
await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb)
|
||||
|
||||
async def awarmup(self):
|
||||
# Print a message for crawl4ai and its version
|
||||
if self.verbose:
|
||||
print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
|
||||
print("[LOG] 🌤️ Warming up the AsyncWebCrawler")
|
||||
# await async_db_manager.ainit_db()
|
||||
# # await async_db_manager.initialize()
|
||||
# await self.arun(
|
||||
# url="https://google.com/",
|
||||
# word_count_threshold=5,
|
||||
# bypass_cache=False,
|
||||
# verbose=False,
|
||||
# )
|
||||
self.ready = True
|
||||
if self.verbose:
|
||||
print("[LOG] 🌞 AsyncWebCrawler is ready to crawl")
|
||||
|
||||
async def arun(
|
||||
self,
|
||||
url: str,
|
||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
||||
extraction_strategy: ExtractionStrategy = None,
|
||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||
bypass_cache: bool = False,
|
||||
css_selector: str = None,
|
||||
screenshot: bool = False,
|
||||
user_agent: str = None,
|
||||
verbose=True,
|
||||
disable_cache: bool = False,
|
||||
no_cache_read: bool = False,
|
||||
no_cache_write: bool = False,
|
||||
**kwargs,
|
||||
) -> CrawlResult:
|
||||
"""
|
||||
Runs the crawler for a single source: URL (web, local file, or raw HTML).
|
||||
|
||||
Args:
|
||||
url (str): The URL to crawl. Supported prefixes:
|
||||
- 'http://' or 'https://': Web URL to crawl.
|
||||
- 'file://': Local file path to process.
|
||||
- 'raw:': Raw HTML content to process.
|
||||
... [other existing parameters]
|
||||
|
||||
Returns:
|
||||
CrawlResult: The result of the crawling and processing.
|
||||
"""
|
||||
try:
|
||||
if disable_cache:
|
||||
bypass_cache = True
|
||||
no_cache_read = True
|
||||
no_cache_write = True
|
||||
|
||||
extraction_strategy = extraction_strategy or NoExtractionStrategy()
|
||||
extraction_strategy.verbose = verbose
|
||||
if not isinstance(extraction_strategy, ExtractionStrategy):
|
||||
raise ValueError("Unsupported extraction strategy")
|
||||
if not isinstance(chunking_strategy, ChunkingStrategy):
|
||||
raise ValueError("Unsupported chunking strategy")
|
||||
|
||||
word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)
|
||||
|
||||
async_response: AsyncCrawlResponse = None
|
||||
cached = None
|
||||
screenshot_data = None
|
||||
extracted_content = None
|
||||
|
||||
is_web_url = url.startswith(('http://', 'https://'))
|
||||
is_local_file = url.startswith("file://")
|
||||
is_raw_html = url.startswith("raw:")
|
||||
_url = url if not is_raw_html else "Raw HTML"
|
||||
|
||||
start_time = time.perf_counter()
|
||||
cached_result = None
|
||||
if is_web_url and (not bypass_cache or not no_cache_read) and not self.always_by_pass_cache:
|
||||
cached_result = await async_db_manager.aget_cached_url(url)
|
||||
|
||||
if cached_result:
|
||||
html = sanitize_input_encode(cached_result.html)
|
||||
extracted_content = sanitize_input_encode(cached_result.extracted_content or "")
|
||||
if screenshot:
|
||||
screenshot_data = cached_result.screenshot
|
||||
if not screenshot_data:
|
||||
cached_result = None
|
||||
if verbose:
|
||||
print(
|
||||
f"[LOG] 1️⃣ ✅ Page fetched (cache) for {_url}, success: {bool(html)}, time taken: {time.perf_counter() - start_time:.2f} seconds"
|
||||
)
|
||||
|
||||
|
||||
if not cached or not html:
|
||||
t1 = time.perf_counter()
|
||||
|
||||
if user_agent:
|
||||
self.crawler_strategy.update_user_agent(user_agent)
|
||||
async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs)
|
||||
html = sanitize_input_encode(async_response.html)
|
||||
screenshot_data = async_response.screenshot
|
||||
t2 = time.perf_counter()
|
||||
if verbose:
|
||||
print(
|
||||
f"[LOG] 1️⃣ ✅ Page fetched (no-cache) for {_url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
|
||||
)
|
||||
|
||||
t1 = time.perf_counter()
|
||||
crawl_result = await self.aprocess_html(
|
||||
url=url,
|
||||
html=html,
|
||||
extracted_content=extracted_content,
|
||||
word_count_threshold=word_count_threshold,
|
||||
extraction_strategy=extraction_strategy,
|
||||
chunking_strategy=chunking_strategy,
|
||||
css_selector=css_selector,
|
||||
screenshot=screenshot_data,
|
||||
verbose=verbose,
|
||||
is_cached=bool(cached),
|
||||
async_response=async_response,
|
||||
bypass_cache=bypass_cache,
|
||||
is_web_url = is_web_url,
|
||||
is_local_file = is_local_file,
|
||||
is_raw_html = is_raw_html,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if async_response:
|
||||
crawl_result.status_code = async_response.status_code
|
||||
crawl_result.response_headers = async_response.response_headers
|
||||
crawl_result.downloaded_files = async_response.downloaded_files
|
||||
else:
|
||||
crawl_result.status_code = 200
|
||||
crawl_result.response_headers = cached_result.response_headers if cached_result else {}
|
||||
|
||||
crawl_result.success = bool(html)
|
||||
crawl_result.session_id = kwargs.get("session_id", None)
|
||||
|
||||
if verbose:
|
||||
print(
|
||||
f"[LOG] 🔥 🚀 Crawling done for {_url}, success: {crawl_result.success}, time taken: {time.perf_counter() - start_time:.2f} seconds"
|
||||
)
|
||||
|
||||
if not is_raw_html and not no_cache_write:
|
||||
if not bool(cached_result) or kwargs.get("bypass_cache", False) or self.always_by_pass_cache:
|
||||
await async_db_manager.acache_url(crawl_result)
|
||||
|
||||
|
||||
return crawl_result
|
||||
|
||||
except Exception as e:
|
||||
if not hasattr(e, "msg"):
|
||||
e.msg = str(e)
|
||||
print(f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}")
|
||||
return CrawlResult(url=url, html="", markdown = f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}", success=False, error_message=e.msg)
|
||||
|
||||
async def arun_many(
|
||||
self,
|
||||
urls: List[str],
|
||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
||||
extraction_strategy: ExtractionStrategy = None,
|
||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||
bypass_cache: bool = False,
|
||||
css_selector: str = None,
|
||||
screenshot: bool = False,
|
||||
user_agent: str = None,
|
||||
verbose=True,
|
||||
**kwargs,
|
||||
) -> List[CrawlResult]:
|
||||
"""
|
||||
Runs the crawler for multiple sources: URLs (web, local files, or raw HTML).
|
||||
|
||||
Args:
|
||||
urls (List[str]): A list of URLs with supported prefixes:
|
||||
- 'http://' or 'https://': Web URL to crawl.
|
||||
- 'file://': Local file path to process.
|
||||
- 'raw:': Raw HTML content to process.
|
||||
... [other existing parameters]
|
||||
|
||||
Returns:
|
||||
List[CrawlResult]: The results of the crawling and processing.
|
||||
"""
|
||||
semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed
|
||||
semaphore = asyncio.Semaphore(semaphore_count)
|
||||
|
||||
async def crawl_with_semaphore(url):
|
||||
async with semaphore:
|
||||
return await self.arun(
|
||||
url,
|
||||
word_count_threshold=word_count_threshold,
|
||||
extraction_strategy=extraction_strategy,
|
||||
chunking_strategy=chunking_strategy,
|
||||
bypass_cache=bypass_cache,
|
||||
css_selector=css_selector,
|
||||
screenshot=screenshot,
|
||||
user_agent=user_agent,
|
||||
verbose=verbose,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
tasks = [crawl_with_semaphore(url) for url in urls]
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
return [result if not isinstance(result, Exception) else str(result) for result in results]
|
||||
|
||||
async def aprocess_html(
|
||||
self,
|
||||
url: str,
|
||||
html: str,
|
||||
extracted_content: str,
|
||||
word_count_threshold: int,
|
||||
extraction_strategy: ExtractionStrategy,
|
||||
chunking_strategy: ChunkingStrategy,
|
||||
css_selector: str,
|
||||
screenshot: str,
|
||||
verbose: bool,
|
||||
**kwargs,
|
||||
) -> CrawlResult:
|
||||
t = time.perf_counter()
|
||||
# Extract content from HTML
|
||||
try:
|
||||
_url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
|
||||
t1 = time.perf_counter()
|
||||
scrapping_strategy = WebScrapingStrategy()
|
||||
# result = await scrapping_strategy.ascrap(
|
||||
result = scrapping_strategy.scrap(
|
||||
url,
|
||||
html,
|
||||
word_count_threshold=word_count_threshold,
|
||||
css_selector=css_selector,
|
||||
only_text=kwargs.get("only_text", False),
|
||||
image_description_min_word_threshold=kwargs.get(
|
||||
"image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
|
||||
),
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if result is None:
|
||||
raise ValueError(f"Process HTML, Failed to extract content from the website: {url}")
|
||||
except InvalidCSSSelectorError as e:
|
||||
raise ValueError(str(e))
|
||||
except Exception as e:
|
||||
raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")
|
||||
|
||||
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
|
||||
markdown = sanitize_input_encode(result.get("markdown", ""))
|
||||
fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
|
||||
fit_html = sanitize_input_encode(result.get("fit_html", ""))
|
||||
media = result.get("media", [])
|
||||
links = result.get("links", [])
|
||||
metadata = result.get("metadata", {})
|
||||
|
||||
if verbose:
|
||||
print(
|
||||
f"[LOG] 2️⃣ ✅ Scraping done for {_url}, success: True, time taken: {time.perf_counter() - t1:.2f} seconds"
|
||||
)
|
||||
|
||||
if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy):
|
||||
t1 = time.perf_counter()
|
||||
# Check if extraction strategy is type of JsonCssExtractionStrategy
|
||||
if isinstance(extraction_strategy, JsonCssExtractionStrategy) or isinstance(extraction_strategy, JsonCssExtractionStrategy):
|
||||
extraction_strategy.verbose = verbose
|
||||
extracted_content = extraction_strategy.run(url, [html])
|
||||
extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)
|
||||
else:
|
||||
sections = chunking_strategy.chunk(markdown)
|
||||
extracted_content = extraction_strategy.run(url, sections)
|
||||
extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)
|
||||
if verbose:
|
||||
print(
|
||||
f"[LOG] 3️⃣ ✅ Extraction done for {_url}, time taken: {time.perf_counter() - t1:.2f} seconds"
|
||||
)
|
||||
|
||||
screenshot = None if not screenshot else screenshot
|
||||
|
||||
return CrawlResult(
|
||||
url=url,
|
||||
html=html,
|
||||
cleaned_html=format_html(cleaned_html),
|
||||
markdown=markdown,
|
||||
fit_markdown=fit_markdown,
|
||||
fit_html= fit_html,
|
||||
media=media,
|
||||
links=links,
|
||||
metadata=metadata,
|
||||
screenshot=screenshot,
|
||||
extracted_content=extracted_content,
|
||||
success=True,
|
||||
error_message="",
|
||||
)
|
||||
|
||||
async def aclear_cache(self):
|
||||
# await async_db_manager.aclear_db()
|
||||
await async_db_manager.cleanup()
|
||||
|
||||
async def aflush_cache(self):
|
||||
await async_db_manager.aflush_db()
|
||||
|
||||
async def aget_cache_size(self):
|
||||
return await async_db_manager.aget_total_count()
|
||||
|
||||
|
||||
@@ -7,14 +7,14 @@ from pathlib import Path
|
||||
from typing import Optional, List, Union
|
||||
import json
|
||||
import asyncio
|
||||
from .models import CrawlResult
|
||||
from .models import CrawlResult, MarkdownGenerationResult
|
||||
from .async_database import async_db_manager
|
||||
from .chunking_strategy import *
|
||||
from .content_filter_strategy import *
|
||||
from .extraction_strategy import *
|
||||
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
|
||||
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
|
||||
from .content_scrapping_strategy import WebScrapingStrategy
|
||||
from .content_scraping_strategy import WebScrapingStrategy
|
||||
from .async_logger import AsyncLogger
|
||||
|
||||
from .config import (
|
||||
@@ -476,8 +476,8 @@ class AsyncWebCrawler:
|
||||
html,
|
||||
word_count_threshold=word_count_threshold,
|
||||
css_selector=css_selector,
|
||||
only_text=kwargs.get("only_text", False),
|
||||
image_description_min_word_threshold=kwargs.get(
|
||||
only_text=kwargs.pop("only_text", False),
|
||||
image_description_min_word_threshold=kwargs.pop(
|
||||
"image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
|
||||
),
|
||||
content_filter = content_filter,
|
||||
@@ -491,6 +491,8 @@ class AsyncWebCrawler:
|
||||
except Exception as e:
|
||||
raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")
|
||||
|
||||
markdown_v2: MarkdownGenerationResult = result.get("markdown_v2", None)
|
||||
|
||||
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
|
||||
markdown = sanitize_input_encode(result.get("markdown", ""))
|
||||
fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
|
||||
@@ -542,6 +544,7 @@ class AsyncWebCrawler:
|
||||
url=url,
|
||||
html=html,
|
||||
cleaned_html=format_html(cleaned_html),
|
||||
markdown_v2=markdown_v2,
|
||||
markdown=markdown,
|
||||
fit_markdown=fit_markdown,
|
||||
fit_html= fit_html,
|
||||
|
||||
@@ -10,6 +10,13 @@ from abc import ABC, abstractmethod
|
||||
|
||||
from snowballstemmer import stemmer
|
||||
|
||||
|
||||
# import regex
|
||||
# def tokenize_text(text):
|
||||
# # Regular expression to match words or CJK (Chinese, Japanese, Korean) characters
|
||||
# pattern = r'\p{L}+|\p{N}+|[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}ー]|[\p{P}]'
|
||||
# return regex.findall(pattern, text)
|
||||
|
||||
# from nltk.stem import PorterStemmer
|
||||
# ps = PorterStemmer()
|
||||
class RelevantContentFilter(ABC):
|
||||
@@ -57,9 +64,14 @@ class RelevantContentFilter(ABC):
|
||||
query_parts = []
|
||||
|
||||
# Title
|
||||
if soup.title:
|
||||
query_parts.append(soup.title.string)
|
||||
elif soup.find('h1'):
|
||||
try:
|
||||
title = soup.title.string
|
||||
if title:
|
||||
query_parts.append(title)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if soup.find('h1'):
|
||||
query_parts.append(soup.find('h1').get_text())
|
||||
|
||||
# Meta tags
|
||||
@@ -81,7 +93,7 @@ class RelevantContentFilter(ABC):
|
||||
return ' '.join(filter(None, query_parts))
|
||||
|
||||
|
||||
def extract_text_chunks(self, body: Tag) -> List[Tuple[str, str]]:
|
||||
def extract_text_chunks(self, body: Tag, min_word_threshold: int = None) -> List[Tuple[str, str]]:
|
||||
"""
|
||||
Extracts text chunks from a BeautifulSoup body element while preserving order.
|
||||
Returns list of tuples (text, tag_name) for classification.
|
||||
@@ -155,6 +167,9 @@ class RelevantContentFilter(ABC):
|
||||
if text:
|
||||
chunks.append((chunk_index, text, 'content', body))
|
||||
|
||||
if min_word_threshold:
|
||||
chunks = [chunk for chunk in chunks if len(chunk[1].split()) >= min_word_threshold]
|
||||
|
||||
return chunks
|
||||
|
||||
|
||||
@@ -274,15 +289,26 @@ class BM25ContentFilter(RelevantContentFilter):
|
||||
}
|
||||
self.stemmer = stemmer(language)
|
||||
|
||||
def filter_content(self, html: str) -> List[str]:
|
||||
def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
|
||||
"""Implements content filtering using BM25 algorithm with priority tag handling"""
|
||||
if not html or not isinstance(html, str):
|
||||
return []
|
||||
|
||||
soup = BeautifulSoup(html, 'lxml')
|
||||
|
||||
# Check if body is present
|
||||
if not soup.body:
|
||||
# Wrap in body tag if missing
|
||||
soup = BeautifulSoup(f'<body>{html}</body>', 'lxml')
|
||||
body = soup.find('body')
|
||||
query = self.extract_page_query(soup.find('head'), body)
|
||||
candidates = self.extract_text_chunks(body)
|
||||
|
||||
query = self.extract_page_query(soup, body)
|
||||
|
||||
if not query:
|
||||
return []
|
||||
# return [self.clean_element(soup)]
|
||||
|
||||
candidates = self.extract_text_chunks(body, min_word_threshold)
|
||||
|
||||
if not candidates:
|
||||
return []
|
||||
@@ -299,6 +325,10 @@ class BM25ContentFilter(RelevantContentFilter):
|
||||
for _, chunk, _, _ in candidates]
|
||||
tokenized_query = [self.stemmer.stemWord(word) for word in query.lower().split()]
|
||||
|
||||
# tokenized_corpus = [[self.stemmer.stemWord(word) for word in tokenize_text(chunk.lower())]
|
||||
# for _, chunk, _, _ in candidates]
|
||||
# tokenized_query = [self.stemmer.stemWord(word) for word in tokenize_text(query.lower())]
|
||||
|
||||
# Clean from stop words and noise
|
||||
tokenized_corpus = [clean_tokens(tokens) for tokens in tokenized_corpus]
|
||||
tokenized_query = clean_tokens(tokenized_query)
|
||||
@@ -326,3 +356,147 @@ class BM25ContentFilter(RelevantContentFilter):
|
||||
selected_candidates.sort(key=lambda x: x[0])
|
||||
|
||||
return [self.clean_element(tag) for _, _, tag in selected_candidates]
|
||||
|
||||
|
||||
class HeuristicContentFilter(RelevantContentFilter):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
# Weights for different heuristics
|
||||
self.tag_weights = {
|
||||
'article': 10,
|
||||
'main': 8,
|
||||
'section': 5,
|
||||
'div': 3,
|
||||
'p': 2,
|
||||
'pre': 2,
|
||||
'code': 2,
|
||||
'blockquote': 2,
|
||||
'li': 1,
|
||||
'span': 1,
|
||||
}
|
||||
self.max_depth = 5 # Maximum depth from body to consider
|
||||
|
||||
def filter_content(self, html: str) -> List[str]:
|
||||
"""Implements heuristic content filtering without relying on a query."""
|
||||
if not html or not isinstance(html, str):
|
||||
return []
|
||||
|
||||
soup = BeautifulSoup(html, 'lxml')
|
||||
|
||||
# Ensure there is a body tag
|
||||
if not soup.body:
|
||||
soup = BeautifulSoup(f'<body>{html}</body>', 'lxml')
|
||||
body = soup.body
|
||||
|
||||
# Extract candidate text chunks
|
||||
candidates = self.extract_text_chunks(body)
|
||||
|
||||
if not candidates:
|
||||
return []
|
||||
|
||||
# Score each candidate
|
||||
scored_candidates = []
|
||||
for index, text, tag_type, tag in candidates:
|
||||
score = self.score_element(tag, text)
|
||||
if score > 0:
|
||||
scored_candidates.append((score, index, text, tag))
|
||||
|
||||
# Sort candidates by score and then by document order
|
||||
scored_candidates.sort(key=lambda x: (-x[0], x[1]))
|
||||
|
||||
# Extract the top candidates (e.g., top 5)
|
||||
top_candidates = scored_candidates[:5] # Adjust the number as needed
|
||||
|
||||
# Sort the top candidates back to their original document order
|
||||
top_candidates.sort(key=lambda x: x[1])
|
||||
|
||||
# Clean and return the content
|
||||
return [self.clean_element(tag) for _, _, _, tag in top_candidates]
|
||||
|
||||
def score_element(self, tag: Tag, text: str) -> float:
|
||||
"""Compute a score for an element based on heuristics."""
|
||||
if not text or not tag:
|
||||
return 0
|
||||
|
||||
# Exclude unwanted tags
|
||||
if self.is_excluded(tag):
|
||||
return 0
|
||||
|
||||
# Text density
|
||||
text_length = len(text.strip())
|
||||
html_length = len(str(tag))
|
||||
text_density = text_length / html_length if html_length > 0 else 0
|
||||
|
||||
# Link density
|
||||
link_text_length = sum(len(a.get_text().strip()) for a in tag.find_all('a'))
|
||||
link_density = link_text_length / text_length if text_length > 0 else 0
|
||||
|
||||
# Tag weight
|
||||
tag_weight = self.tag_weights.get(tag.name, 1)
|
||||
|
||||
# Depth factor (prefer elements closer to the body tag)
|
||||
depth = self.get_depth(tag)
|
||||
depth_weight = max(self.max_depth - depth, 1) / self.max_depth
|
||||
|
||||
# Compute the final score
|
||||
score = (text_density * tag_weight * depth_weight) / (1 + link_density)
|
||||
|
||||
return score
|
||||
|
||||
def get_depth(self, tag: Tag) -> int:
|
||||
"""Compute the depth of the tag from the body tag."""
|
||||
depth = 0
|
||||
current = tag
|
||||
while current and current != current.parent and current.name != 'body':
|
||||
current = current.parent
|
||||
depth += 1
|
||||
return depth
|
||||
|
||||
def extract_text_chunks(self, body: Tag) -> List[Tuple[int, str, str, Tag]]:
|
||||
"""
|
||||
Extracts text chunks from the body element while preserving order.
|
||||
Returns list of tuples (index, text, tag_type, tag) for scoring.
|
||||
"""
|
||||
chunks = []
|
||||
index = 0
|
||||
|
||||
def traverse(element):
|
||||
nonlocal index
|
||||
if isinstance(element, NavigableString):
|
||||
return
|
||||
if not isinstance(element, Tag):
|
||||
return
|
||||
if self.is_excluded(element):
|
||||
return
|
||||
# Only consider included tags
|
||||
if element.name in self.included_tags:
|
||||
text = element.get_text(separator=' ', strip=True)
|
||||
if len(text.split()) >= self.min_word_count:
|
||||
tag_type = 'header' if element.name in self.header_tags else 'content'
|
||||
chunks.append((index, text, tag_type, element))
|
||||
index += 1
|
||||
# Do not traverse children of this element to prevent duplication
|
||||
return
|
||||
for child in element.children:
|
||||
traverse(child)
|
||||
|
||||
traverse(body)
|
||||
return chunks
|
||||
|
||||
def is_excluded(self, tag: Tag) -> bool:
|
||||
"""Determine if a tag should be excluded based on heuristics."""
|
||||
if tag.name in self.excluded_tags:
|
||||
return True
|
||||
class_id = ' '.join(filter(None, [
|
||||
' '.join(tag.get('class', [])),
|
||||
tag.get('id', '')
|
||||
]))
|
||||
if self.negative_patterns.search(class_id):
|
||||
return True
|
||||
# Exclude tags with high link density (e.g., navigation menus)
|
||||
text = tag.get_text(separator=' ', strip=True)
|
||||
link_text_length = sum(len(a.get_text(strip=True)) for a in tag.find_all('a'))
|
||||
text_length = len(text)
|
||||
if text_length > 0 and (link_text_length / text_length) > 0.5:
|
||||
return True
|
||||
return False
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import re # Point 1: Pre-Compile Regular Expressions
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, Any
|
||||
from typing import Dict, Any, Optional
|
||||
from bs4 import BeautifulSoup
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
import asyncio, requests, re, os
|
||||
@@ -9,103 +9,19 @@ from bs4 import element, NavigableString, Comment
|
||||
from urllib.parse import urljoin
|
||||
from requests.exceptions import InvalidSchema
|
||||
# from .content_cleaning_strategy import ContentCleaningStrategy
|
||||
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter
|
||||
|
||||
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter#, HeuristicContentFilter
|
||||
from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerationStrategy
|
||||
from .models import MarkdownGenerationResult
|
||||
from .utils import (
|
||||
sanitize_input_encode,
|
||||
sanitize_html,
|
||||
extract_metadata,
|
||||
InvalidCSSSelectorError,
|
||||
# CustomHTML2Text,
|
||||
CustomHTML2Text,
|
||||
normalize_url,
|
||||
is_external_url
|
||||
|
||||
is_external_url
|
||||
)
|
||||
|
||||
from .html2text import HTML2Text
|
||||
class CustomHTML2Text(HTML2Text):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.inside_pre = False
|
||||
self.inside_code = False
|
||||
self.preserve_tags = set() # Set of tags to preserve
|
||||
self.current_preserved_tag = None
|
||||
self.preserved_content = []
|
||||
self.preserve_depth = 0
|
||||
|
||||
# Configuration options
|
||||
self.skip_internal_links = False
|
||||
self.single_line_break = False
|
||||
self.mark_code = False
|
||||
self.include_sup_sub = False
|
||||
self.body_width = 0
|
||||
self.ignore_mailto_links = True
|
||||
self.ignore_links = False
|
||||
self.escape_backslash = False
|
||||
self.escape_dot = False
|
||||
self.escape_plus = False
|
||||
self.escape_dash = False
|
||||
self.escape_snob = False
|
||||
|
||||
def update_params(self, **kwargs):
|
||||
"""Update parameters and set preserved tags."""
|
||||
for key, value in kwargs.items():
|
||||
if key == 'preserve_tags':
|
||||
self.preserve_tags = set(value)
|
||||
else:
|
||||
setattr(self, key, value)
|
||||
|
||||
def handle_tag(self, tag, attrs, start):
|
||||
# Handle preserved tags
|
||||
if tag in self.preserve_tags:
|
||||
if start:
|
||||
if self.preserve_depth == 0:
|
||||
self.current_preserved_tag = tag
|
||||
self.preserved_content = []
|
||||
# Format opening tag with attributes
|
||||
attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None)
|
||||
self.preserved_content.append(f'<{tag}{attr_str}>')
|
||||
self.preserve_depth += 1
|
||||
return
|
||||
else:
|
||||
self.preserve_depth -= 1
|
||||
if self.preserve_depth == 0:
|
||||
self.preserved_content.append(f'</{tag}>')
|
||||
# Output the preserved HTML block with proper spacing
|
||||
preserved_html = ''.join(self.preserved_content)
|
||||
self.o('\n' + preserved_html + '\n')
|
||||
self.current_preserved_tag = None
|
||||
return
|
||||
|
||||
# If we're inside a preserved tag, collect all content
|
||||
if self.preserve_depth > 0:
|
||||
if start:
|
||||
# Format nested tags with attributes
|
||||
attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None)
|
||||
self.preserved_content.append(f'<{tag}{attr_str}>')
|
||||
else:
|
||||
self.preserved_content.append(f'</{tag}>')
|
||||
return
|
||||
|
||||
# Handle pre tags
|
||||
if tag == 'pre':
|
||||
if start:
|
||||
self.o('```\n')
|
||||
self.inside_pre = True
|
||||
else:
|
||||
self.o('\n```')
|
||||
self.inside_pre = False
|
||||
# elif tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
||||
# pass
|
||||
else:
|
||||
super().handle_tag(tag, attrs, start)
|
||||
|
||||
def handle_data(self, data, entity_char=False):
|
||||
"""Override handle_data to capture content within preserved tags."""
|
||||
if self.preserve_depth > 0:
|
||||
self.preserved_content.append(data)
|
||||
return
|
||||
super().handle_data(data, entity_char)
|
||||
from .tools import profile_and_time
|
||||
|
||||
# Pre-compile regular expressions for Open Graph and Twitter metadata
|
||||
OG_REGEX = re.compile(r'^og:')
|
||||
@@ -164,6 +80,97 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
|
||||
return await asyncio.to_thread(self._get_content_of_website_optimized, url, html, **kwargs)
|
||||
|
||||
|
||||
def _generate_markdown_content(self,
|
||||
cleaned_html: str,
|
||||
html: str,
|
||||
url: str,
|
||||
success: bool,
|
||||
**kwargs) -> Dict[str, Any]:
|
||||
"""Generate markdown content using either new strategy or legacy method.
|
||||
|
||||
Args:
|
||||
cleaned_html: Sanitized HTML content
|
||||
html: Original HTML content
|
||||
url: Base URL of the page
|
||||
success: Whether scraping was successful
|
||||
**kwargs: Additional options including:
|
||||
- markdown_generator: Optional[MarkdownGenerationStrategy]
|
||||
- html2text: Dict[str, Any] options for HTML2Text
|
||||
- content_filter: Optional[RelevantContentFilter]
|
||||
- fit_markdown: bool
|
||||
- fit_markdown_user_query: Optional[str]
|
||||
- fit_markdown_bm25_threshold: float
|
||||
|
||||
Returns:
|
||||
Dict containing markdown content in various formats
|
||||
"""
|
||||
markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerationStrategy())
|
||||
|
||||
if markdown_generator:
|
||||
try:
|
||||
markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
|
||||
cleaned_html=cleaned_html,
|
||||
base_url=url,
|
||||
html2text_options=kwargs.get('html2text', {}),
|
||||
content_filter=kwargs.get('content_filter', None)
|
||||
)
|
||||
|
||||
return {
|
||||
'markdown': markdown_result.raw_markdown,
|
||||
'fit_markdown': markdown_result.fit_markdown or "Set flag 'fit_markdown' to True to get cleaned HTML content.",
|
||||
'fit_html': markdown_result.fit_html or "Set flag 'fit_markdown' to True to get cleaned HTML content.",
|
||||
'markdown_v2': markdown_result
|
||||
}
|
||||
except Exception as e:
|
||||
self._log('error',
|
||||
message="Error using new markdown generation strategy: {error}",
|
||||
tag="SCRAPE",
|
||||
params={"error": str(e)}
|
||||
)
|
||||
markdown_generator = None
|
||||
return {
|
||||
'markdown': f"Error using new markdown generation strategy: {str(e)}",
|
||||
'fit_markdown': "Set flag 'fit_markdown' to True to get cleaned HTML content.",
|
||||
'fit_html': "Set flag 'fit_markdown' to True to get cleaned HTML content.",
|
||||
'markdown_v2': None
|
||||
}
|
||||
|
||||
# Legacy method
|
||||
h = CustomHTML2Text()
|
||||
h.update_params(**kwargs.get('html2text', {}))
|
||||
markdown = h.handle(cleaned_html)
|
||||
markdown = markdown.replace(' ```', '```')
|
||||
|
||||
fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content."
|
||||
fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content."
|
||||
|
||||
if kwargs.get('content_filter', None) or kwargs.get('fit_markdown', False):
|
||||
content_filter = kwargs.get('content_filter', None)
|
||||
if not content_filter:
|
||||
content_filter = BM25ContentFilter(
|
||||
user_query=kwargs.get('fit_markdown_user_query', None),
|
||||
bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0)
|
||||
)
|
||||
fit_html = content_filter.filter_content(html)
|
||||
fit_html = '\n'.join('<div>{}</div>'.format(s) for s in fit_html)
|
||||
fit_markdown = h.handle(fit_html)
|
||||
|
||||
markdown_v2 = MarkdownGenerationResult(
|
||||
raw_markdown=markdown,
|
||||
markdown_with_citations=markdown,
|
||||
references_markdown=markdown,
|
||||
fit_markdown=fit_markdown
|
||||
)
|
||||
|
||||
return {
|
||||
'markdown': markdown,
|
||||
'fit_markdown': fit_markdown,
|
||||
'fit_html': fit_html,
|
||||
'markdown_v2' : markdown_v2
|
||||
}
|
||||
|
||||
|
||||
def _get_content_of_website_optimized(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]:
|
||||
success = True
|
||||
if not html:
|
||||
@@ -226,7 +233,9 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
return text_content
|
||||
return None
|
||||
|
||||
def process_image(img, url, index, total_images):
|
||||
def process_image_old(img, url, index, total_images):
|
||||
|
||||
|
||||
#Check if an image has valid display and inside undesired html elements
|
||||
def is_valid_image(img, parent, parent_classes):
|
||||
style = img.get('style', '')
|
||||
@@ -242,8 +251,6 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
|
||||
#Score an image for it's usefulness
|
||||
def score_image_for_usefulness(img, base_url, index, images_count):
|
||||
|
||||
|
||||
image_height = img.get('height')
|
||||
height_value, height_unit = parse_dimension(image_height)
|
||||
image_width = img.get('width')
|
||||
@@ -277,14 +284,14 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
score+=1
|
||||
return score
|
||||
|
||||
|
||||
|
||||
if not is_valid_image(img, img.parent, img.parent.get('class', [])):
|
||||
return None
|
||||
|
||||
score = score_image_for_usefulness(img, url, index, total_images)
|
||||
if score <= IMAGE_SCORE_THRESHOLD:
|
||||
if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD):
|
||||
return None
|
||||
return {
|
||||
|
||||
base_result = {
|
||||
'src': img.get('src', ''),
|
||||
'data-src': img.get('data-src', ''),
|
||||
'alt': img.get('alt', ''),
|
||||
@@ -293,6 +300,113 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
'type': 'image'
|
||||
}
|
||||
|
||||
sources = []
|
||||
srcset = img.get('srcset', '')
|
||||
if srcset:
|
||||
sources = parse_srcset(srcset)
|
||||
if sources:
|
||||
return [dict(base_result, src=source['url'], width=source['width'])
|
||||
for source in sources]
|
||||
|
||||
return [base_result] # Always return a list
|
||||
|
||||
def process_image(img, url, index, total_images):
|
||||
parse_srcset = lambda s: [{'url': u.strip().split()[0], 'width': u.strip().split()[-1].rstrip('w')
|
||||
if ' ' in u else None}
|
||||
for u in [f"http{p}" for p in s.split("http") if p]]
|
||||
|
||||
# Constants for checks
|
||||
classes_to_check = frozenset(['button', 'icon', 'logo'])
|
||||
tags_to_check = frozenset(['button', 'input'])
|
||||
|
||||
# Pre-fetch commonly used attributes
|
||||
style = img.get('style', '')
|
||||
alt = img.get('alt', '')
|
||||
src = img.get('src', '')
|
||||
data_src = img.get('data-src', '')
|
||||
width = img.get('width')
|
||||
height = img.get('height')
|
||||
parent = img.parent
|
||||
parent_classes = parent.get('class', [])
|
||||
|
||||
# Quick validation checks
|
||||
if ('display:none' in style or
|
||||
parent.name in tags_to_check or
|
||||
any(c in cls for c in parent_classes for cls in classes_to_check) or
|
||||
any(c in src for c in classes_to_check) or
|
||||
any(c in alt for c in classes_to_check)):
|
||||
return None
|
||||
|
||||
# Quick score calculation
|
||||
score = 0
|
||||
if width and width.isdigit():
|
||||
width_val = int(width)
|
||||
score += 1 if width_val > 150 else 0
|
||||
if height and height.isdigit():
|
||||
height_val = int(height)
|
||||
score += 1 if height_val > 150 else 0
|
||||
if alt:
|
||||
score += 1
|
||||
score += index/total_images < 0.5
|
||||
|
||||
image_format = ''
|
||||
if "data:image/" in src:
|
||||
image_format = src.split(',')[0].split(';')[0].split('/')[1].split(';')[0]
|
||||
else:
|
||||
image_format = os.path.splitext(src)[1].lower().strip('.').split('?')[0]
|
||||
|
||||
if image_format in ('jpg', 'png', 'webp', 'avif'):
|
||||
score += 1
|
||||
|
||||
if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD):
|
||||
return None
|
||||
|
||||
# Use set for deduplication
|
||||
unique_urls = set()
|
||||
image_variants = []
|
||||
|
||||
# Generate a unique group ID for this set of variants
|
||||
group_id = index
|
||||
|
||||
# Base image info template
|
||||
base_info = {
|
||||
'alt': alt,
|
||||
'desc': find_closest_parent_with_useful_text(img),
|
||||
'score': score,
|
||||
'type': 'image',
|
||||
'group_id': group_id # Group ID for this set of variants
|
||||
}
|
||||
|
||||
# Inline function for adding variants
|
||||
def add_variant(src, width=None):
|
||||
if src and not src.startswith('data:') and src not in unique_urls:
|
||||
unique_urls.add(src)
|
||||
image_variants.append({**base_info, 'src': src, 'width': width})
|
||||
|
||||
# Process all sources
|
||||
add_variant(src)
|
||||
add_variant(data_src)
|
||||
|
||||
# Handle srcset and data-srcset in one pass
|
||||
for attr in ('srcset', 'data-srcset'):
|
||||
if value := img.get(attr):
|
||||
for source in parse_srcset(value):
|
||||
add_variant(source['url'], source['width'])
|
||||
|
||||
# Quick picture element check
|
||||
if picture := img.find_parent('picture'):
|
||||
for source in picture.find_all('source'):
|
||||
if srcset := source.get('srcset'):
|
||||
for src in parse_srcset(srcset):
|
||||
add_variant(src['url'], src['width'])
|
||||
|
||||
# Framework-specific attributes in one pass
|
||||
for attr, value in img.attrs.items():
|
||||
if attr.startswith('data-') and ('src' in attr or 'srcset' in attr) and 'http' in value:
|
||||
add_variant(value)
|
||||
|
||||
return image_variants if image_variants else None
|
||||
|
||||
def remove_unwanted_attributes(element, important_attrs, keep_data_attributes=False):
|
||||
attrs_to_remove = []
|
||||
for attr in element.attrs:
|
||||
@@ -484,13 +598,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
links['internal'] = list(internal_links_dict.values())
|
||||
links['external'] = list(external_links_dict.values())
|
||||
|
||||
|
||||
# # Process images using ThreadPoolExecutor
|
||||
imgs = body.find_all('img')
|
||||
|
||||
with ThreadPoolExecutor() as executor:
|
||||
image_results = list(executor.map(process_image, imgs, [url]*len(imgs), range(len(imgs)), [len(imgs)]*len(imgs)))
|
||||
media['images'] = [result for result in image_results if result is not None]
|
||||
# For test we use for loop instead of thread
|
||||
media['images'] = [
|
||||
img for result in (process_image(img, url, i, len(imgs))
|
||||
for i, img in enumerate(imgs))
|
||||
if result is not None
|
||||
for img in result
|
||||
]
|
||||
|
||||
def flatten_nested_elements(node):
|
||||
if isinstance(node, NavigableString):
|
||||
@@ -545,41 +662,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
|
||||
cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ')
|
||||
|
||||
try:
|
||||
h = CustomHTML2Text()
|
||||
h.update_params(**kwargs.get('html2text', {}))
|
||||
markdown = h.handle(cleaned_html)
|
||||
except Exception as e:
|
||||
if not h:
|
||||
h = CustomHTML2Text()
|
||||
self._log('error',
|
||||
message="Error converting HTML to markdown: {error}",
|
||||
tag="SCRAPE",
|
||||
params={"error": str(e)}
|
||||
)
|
||||
markdown = h.handle(sanitize_html(cleaned_html))
|
||||
markdown = markdown.replace(' ```', '```')
|
||||
|
||||
markdown_content = self._generate_markdown_content(
|
||||
cleaned_html=cleaned_html,
|
||||
html=html,
|
||||
url=url,
|
||||
success=success,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
|
||||
fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content."
|
||||
fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content."
|
||||
if kwargs.get('content_filter', None) or kwargs.get('fit_markdown', False):
|
||||
content_filter = kwargs.get('content_filter', None)
|
||||
if not content_filter:
|
||||
content_filter = BM25ContentFilter(
|
||||
user_query= kwargs.get('fit_markdown_user_query', None),
|
||||
bm25_threshold= kwargs.get('fit_markdown_bm25_threshold', 1.0)
|
||||
)
|
||||
fit_html = content_filter.filter_content(html)
|
||||
fit_html = '\n'.join('<div>{}</div>'.format(s) for s in fit_html)
|
||||
fit_markdown = h.handle(fit_html)
|
||||
|
||||
cleaned_html = sanitize_html(cleaned_html)
|
||||
return {
|
||||
'markdown': markdown,
|
||||
'fit_markdown': fit_markdown,
|
||||
'fit_html': fit_html,
|
||||
**markdown_content,
|
||||
'cleaned_html': cleaned_html,
|
||||
'success': success,
|
||||
'media': media,
|
||||
@@ -283,7 +283,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
print(f"[LOG] ✅ Crawled {url} successfully!")
|
||||
|
||||
return html
|
||||
except InvalidArgumentException:
|
||||
except InvalidArgumentException as e:
|
||||
if not hasattr(e, 'msg'):
|
||||
e.msg = sanitize_input_encode(str(e))
|
||||
raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
|
||||
|
||||
116
crawl4ai/markdown_generation_strategy.py
Normal file
116
crawl4ai/markdown_generation_strategy.py
Normal file
@@ -0,0 +1,116 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Optional, Dict, Any, Tuple
|
||||
from .models import MarkdownGenerationResult
|
||||
from .utils import CustomHTML2Text
|
||||
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter
|
||||
import re
|
||||
from urllib.parse import urljoin
|
||||
|
||||
# Pre-compile the regex pattern
|
||||
LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)')
|
||||
|
||||
class MarkdownGenerationStrategy(ABC):
|
||||
"""Abstract base class for markdown generation strategies."""
|
||||
|
||||
@abstractmethod
|
||||
def generate_markdown(self,
|
||||
cleaned_html: str,
|
||||
base_url: str = "",
|
||||
html2text_options: Optional[Dict[str, Any]] = None,
|
||||
content_filter: Optional[RelevantContentFilter] = None,
|
||||
citations: bool = True,
|
||||
**kwargs) -> MarkdownGenerationResult:
|
||||
"""Generate markdown from cleaned HTML."""
|
||||
pass
|
||||
|
||||
class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy):
|
||||
"""Default implementation of markdown generation strategy."""
|
||||
|
||||
def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]:
|
||||
link_map = {}
|
||||
url_cache = {} # Cache for URL joins
|
||||
parts = []
|
||||
last_end = 0
|
||||
counter = 1
|
||||
|
||||
for match in LINK_PATTERN.finditer(markdown):
|
||||
parts.append(markdown[last_end:match.start()])
|
||||
text, url, title = match.groups()
|
||||
|
||||
# Use cached URL if available, otherwise compute and cache
|
||||
if base_url and not url.startswith(('http://', 'https://', 'mailto:')):
|
||||
if url not in url_cache:
|
||||
url_cache[url] = fast_urljoin(base_url, url)
|
||||
url = url_cache[url]
|
||||
|
||||
if url not in link_map:
|
||||
desc = []
|
||||
if title: desc.append(title)
|
||||
if text and text != title: desc.append(text)
|
||||
link_map[url] = (counter, ": " + " - ".join(desc) if desc else "")
|
||||
counter += 1
|
||||
|
||||
num = link_map[url][0]
|
||||
parts.append(f"{text}⟨{num}⟩" if not match.group(0).startswith('!') else f"![{text}⟨{num}⟩]")
|
||||
last_end = match.end()
|
||||
|
||||
parts.append(markdown[last_end:])
|
||||
converted_text = ''.join(parts)
|
||||
|
||||
# Pre-build reference strings
|
||||
references = ["\n\n## References\n\n"]
|
||||
references.extend(
|
||||
f"⟨{num}⟩ {url}{desc}\n"
|
||||
for url, (num, desc) in sorted(link_map.items(), key=lambda x: x[1][0])
|
||||
)
|
||||
|
||||
return converted_text, ''.join(references)
|
||||
|
||||
def generate_markdown(self,
|
||||
cleaned_html: str,
|
||||
base_url: str = "",
|
||||
html2text_options: Optional[Dict[str, Any]] = None,
|
||||
content_filter: Optional[RelevantContentFilter] = None,
|
||||
citations: bool = True,
|
||||
**kwargs) -> MarkdownGenerationResult:
|
||||
"""Generate markdown with citations from cleaned HTML."""
|
||||
# Initialize HTML2Text with options
|
||||
h = CustomHTML2Text()
|
||||
if html2text_options:
|
||||
h.update_params(**html2text_options)
|
||||
|
||||
# Generate raw markdown
|
||||
raw_markdown = h.handle(cleaned_html)
|
||||
raw_markdown = raw_markdown.replace(' ```', '```')
|
||||
|
||||
# Convert links to citations
|
||||
if citations:
|
||||
markdown_with_citations, references_markdown = self.convert_links_to_citations(
|
||||
raw_markdown, base_url
|
||||
)
|
||||
|
||||
# Generate fit markdown if content filter is provided
|
||||
fit_markdown: Optional[str] = None
|
||||
if content_filter:
|
||||
filtered_html = content_filter.filter_content(cleaned_html)
|
||||
filtered_html = '\n'.join('<div>{}</div>'.format(s) for s in filtered_html)
|
||||
fit_markdown = h.handle(filtered_html)
|
||||
|
||||
return MarkdownGenerationResult(
|
||||
raw_markdown=raw_markdown,
|
||||
markdown_with_citations=markdown_with_citations,
|
||||
references_markdown=references_markdown,
|
||||
fit_markdown=fit_markdown,
|
||||
fit_html=filtered_html
|
||||
)
|
||||
|
||||
def fast_urljoin(base: str, url: str) -> str:
|
||||
"""Fast URL joining for common cases."""
|
||||
if url.startswith(('http://', 'https://', 'mailto:', '//')):
|
||||
return url
|
||||
if url.startswith('/'):
|
||||
# Handle absolute paths
|
||||
if base.endswith('/'):
|
||||
return base[:-1] + url
|
||||
return base + url
|
||||
return urljoin(base, url)
|
||||
@@ -1,5 +1,5 @@
|
||||
from pydantic import BaseModel, HttpUrl
|
||||
from typing import List, Dict, Optional, Callable, Awaitable
|
||||
from typing import List, Dict, Optional, Callable, Awaitable, Union
|
||||
|
||||
|
||||
|
||||
@@ -7,6 +7,13 @@ class UrlModel(BaseModel):
|
||||
url: HttpUrl
|
||||
forced: bool = False
|
||||
|
||||
class MarkdownGenerationResult(BaseModel):
|
||||
raw_markdown: str
|
||||
markdown_with_citations: str
|
||||
references_markdown: str
|
||||
fit_markdown: Optional[str] = None
|
||||
fit_html: Optional[str] = None
|
||||
|
||||
class CrawlResult(BaseModel):
|
||||
url: str
|
||||
html: str
|
||||
@@ -16,7 +23,8 @@ class CrawlResult(BaseModel):
|
||||
links: Dict[str, List[Dict]] = {}
|
||||
downloaded_files: Optional[List[str]] = None
|
||||
screenshot: Optional[str] = None
|
||||
markdown: Optional[str] = None
|
||||
markdown: Optional[Union[str, MarkdownGenerationResult]] = None
|
||||
markdown_v2: Optional[MarkdownGenerationResult] = None
|
||||
fit_markdown: Optional[str] = None
|
||||
fit_html: Optional[str] = None
|
||||
extracted_content: Optional[str] = None
|
||||
@@ -36,3 +44,5 @@ class AsyncCrawlResponse(BaseModel):
|
||||
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
|
||||
|
||||
34
crawl4ai/tools.py
Normal file
34
crawl4ai/tools.py
Normal file
@@ -0,0 +1,34 @@
|
||||
import time
|
||||
import cProfile
|
||||
import pstats
|
||||
from functools import wraps
|
||||
|
||||
def profile_and_time(func):
|
||||
@wraps(func)
|
||||
def wrapper(self, *args, **kwargs):
|
||||
# Start timer
|
||||
start_time = time.perf_counter()
|
||||
|
||||
# Setup profiler
|
||||
profiler = cProfile.Profile()
|
||||
profiler.enable()
|
||||
|
||||
# Run function
|
||||
result = func(self, *args, **kwargs)
|
||||
|
||||
# Stop profiler
|
||||
profiler.disable()
|
||||
|
||||
# Calculate elapsed time
|
||||
elapsed_time = time.perf_counter() - start_time
|
||||
|
||||
# Print timing
|
||||
print(f"[PROFILER] Scraping completed in {elapsed_time:.2f} seconds")
|
||||
|
||||
# Print profiling stats
|
||||
stats = pstats.Stats(profiler)
|
||||
stats.sort_stats('cumulative') # Sort by cumulative time
|
||||
stats.print_stats(20) # Print top 20 time-consuming functions
|
||||
|
||||
return result
|
||||
return wrapper
|
||||
@@ -18,6 +18,94 @@ import hashlib
|
||||
from typing import Optional, Tuple, Dict, Any
|
||||
import xxhash
|
||||
|
||||
|
||||
from .html2text import HTML2Text
|
||||
class CustomHTML2Text(HTML2Text):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.inside_pre = False
|
||||
self.inside_code = False
|
||||
self.preserve_tags = set() # Set of tags to preserve
|
||||
self.current_preserved_tag = None
|
||||
self.preserved_content = []
|
||||
self.preserve_depth = 0
|
||||
|
||||
# Configuration options
|
||||
self.skip_internal_links = False
|
||||
self.single_line_break = False
|
||||
self.mark_code = False
|
||||
self.include_sup_sub = False
|
||||
self.body_width = 0
|
||||
self.ignore_mailto_links = True
|
||||
self.ignore_links = False
|
||||
self.escape_backslash = False
|
||||
self.escape_dot = False
|
||||
self.escape_plus = False
|
||||
self.escape_dash = False
|
||||
self.escape_snob = False
|
||||
|
||||
def update_params(self, **kwargs):
|
||||
"""Update parameters and set preserved tags."""
|
||||
for key, value in kwargs.items():
|
||||
if key == 'preserve_tags':
|
||||
self.preserve_tags = set(value)
|
||||
else:
|
||||
setattr(self, key, value)
|
||||
|
||||
def handle_tag(self, tag, attrs, start):
|
||||
# Handle preserved tags
|
||||
if tag in self.preserve_tags:
|
||||
if start:
|
||||
if self.preserve_depth == 0:
|
||||
self.current_preserved_tag = tag
|
||||
self.preserved_content = []
|
||||
# Format opening tag with attributes
|
||||
attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None)
|
||||
self.preserved_content.append(f'<{tag}{attr_str}>')
|
||||
self.preserve_depth += 1
|
||||
return
|
||||
else:
|
||||
self.preserve_depth -= 1
|
||||
if self.preserve_depth == 0:
|
||||
self.preserved_content.append(f'</{tag}>')
|
||||
# Output the preserved HTML block with proper spacing
|
||||
preserved_html = ''.join(self.preserved_content)
|
||||
self.o('\n' + preserved_html + '\n')
|
||||
self.current_preserved_tag = None
|
||||
return
|
||||
|
||||
# If we're inside a preserved tag, collect all content
|
||||
if self.preserve_depth > 0:
|
||||
if start:
|
||||
# Format nested tags with attributes
|
||||
attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None)
|
||||
self.preserved_content.append(f'<{tag}{attr_str}>')
|
||||
else:
|
||||
self.preserved_content.append(f'</{tag}>')
|
||||
return
|
||||
|
||||
# Handle pre tags
|
||||
if tag == 'pre':
|
||||
if start:
|
||||
self.o('```\n')
|
||||
self.inside_pre = True
|
||||
else:
|
||||
self.o('\n```')
|
||||
self.inside_pre = False
|
||||
# elif tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
||||
# pass
|
||||
else:
|
||||
super().handle_tag(tag, attrs, start)
|
||||
|
||||
def handle_data(self, data, entity_char=False):
|
||||
"""Override handle_data to capture content within preserved tags."""
|
||||
if self.preserve_depth > 0:
|
||||
self.preserved_content.append(data)
|
||||
return
|
||||
super().handle_data(data, entity_char)
|
||||
|
||||
|
||||
|
||||
class InvalidCSSSelectorError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ from .extraction_strategy import *
|
||||
from .crawler_strategy import *
|
||||
from typing import List
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from .content_scrapping_strategy import WebScrapingStrategy
|
||||
from .content_scraping_strategy import WebScrapingStrategy
|
||||
from .config import *
|
||||
import warnings
|
||||
import json
|
||||
|
||||
@@ -1,19 +0,0 @@
|
||||
# Railway Deployment
|
||||
|
||||
## Quick Deploy
|
||||
[](https://railway.app/template/crawl4ai)
|
||||
|
||||
## Manual Setup
|
||||
1. Fork this repository
|
||||
2. Create a new Railway project
|
||||
3. Configure environment variables:
|
||||
- `INSTALL_TYPE`: basic or all
|
||||
- `ENABLE_GPU`: true/false
|
||||
4. Deploy!
|
||||
|
||||
## Configuration
|
||||
See `railway.toml` for:
|
||||
- Memory limits
|
||||
- Health checks
|
||||
- Restart policies
|
||||
- Scaling options
|
||||
@@ -1,33 +0,0 @@
|
||||
{
|
||||
"name": "Crawl4AI",
|
||||
"description": "LLM Friendly Web Crawler & Scraper",
|
||||
"render": {
|
||||
"dockerfile": {
|
||||
"path": "Dockerfile"
|
||||
}
|
||||
},
|
||||
"env": [
|
||||
{
|
||||
"key": "INSTALL_TYPE",
|
||||
"description": "Installation type (basic/all)",
|
||||
"default": "basic",
|
||||
"required": true
|
||||
},
|
||||
{
|
||||
"key": "ENABLE_GPU",
|
||||
"description": "Enable GPU support",
|
||||
"default": "false",
|
||||
"required": false
|
||||
}
|
||||
],
|
||||
"services": [
|
||||
{
|
||||
"name": "web",
|
||||
"dockerfile": "./Dockerfile",
|
||||
"healthcheck": {
|
||||
"path": "/health",
|
||||
"port": 11235
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,18 +0,0 @@
|
||||
# railway.toml
|
||||
[build]
|
||||
builder = "DOCKERFILE"
|
||||
dockerfilePath = "Dockerfile"
|
||||
|
||||
[deploy]
|
||||
startCommand = "uvicorn main:app --host 0.0.0.0 --port $PORT"
|
||||
healthcheckPath = "/health"
|
||||
restartPolicyType = "ON_FAILURE"
|
||||
restartPolicyMaxRetries = 3
|
||||
|
||||
[deploy.memory]
|
||||
soft = 2048 # 2GB min for Playwright
|
||||
hard = 4096 # 4GB max
|
||||
|
||||
[deploy.scaling]
|
||||
min = 1
|
||||
max = 1
|
||||
@@ -4,8 +4,8 @@ services:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
args:
|
||||
PYTHON_VERSION: 3.10
|
||||
INSTALL_TYPE: all
|
||||
PYTHON_VERSION: "3.10"
|
||||
INSTALL_TYPE: ${INSTALL_TYPE:-basic}
|
||||
ENABLE_GPU: false
|
||||
profiles: ["local"]
|
||||
ports:
|
||||
|
||||
@@ -52,34 +52,7 @@ async def download_example():
|
||||
else:
|
||||
print("\nNo files were downloaded")
|
||||
|
||||
# 2. Content Filtering with BM25 Example
|
||||
async def content_filtering_example():
|
||||
"""Example of using the new BM25 content filtering"""
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
# Create filter with custom query for OpenAI's blog
|
||||
content_filter = BM25ContentFilter(
|
||||
# user_query="Investment and fundraising",
|
||||
# user_query="Robotic",
|
||||
bm25_threshold=1.0
|
||||
)
|
||||
|
||||
result = await crawler.arun(
|
||||
url="https://techcrunch.com/",
|
||||
content_filter=content_filter,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
print(f"Filtered content: {len(result.fit_markdown)}")
|
||||
print(f"Filtered content: {result.fit_markdown}")
|
||||
|
||||
# Save html
|
||||
with open(os.path.join(__data__, "techcrunch.html"), "w") as f:
|
||||
f.write(result.fit_html)
|
||||
|
||||
with open(os.path.join(__data__, "filtered_content.md"), "w") as f:
|
||||
f.write(result.fit_markdown)
|
||||
|
||||
# 3. Local File and Raw HTML Processing Example
|
||||
# 2. Local File and Raw HTML Processing Example
|
||||
async def local_and_raw_html_example():
|
||||
"""Example of processing local files and raw HTML"""
|
||||
# Create a sample HTML file
|
||||
@@ -115,6 +88,68 @@ async def local_and_raw_html_example():
|
||||
print("Local file content:", local_result.markdown)
|
||||
print("\nRaw HTML content:", raw_result.markdown)
|
||||
|
||||
# 3. Enhanced Markdown Generation Example
|
||||
async def markdown_generation_example():
|
||||
"""Example of enhanced markdown generation with citations and LLM-friendly features"""
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
# Create a content filter (optional)
|
||||
content_filter = BM25ContentFilter(
|
||||
# user_query="History and cultivation",
|
||||
bm25_threshold=1.0
|
||||
)
|
||||
|
||||
result = await crawler.arun(
|
||||
url="https://en.wikipedia.org/wiki/Apple",
|
||||
css_selector="main div#bodyContent",
|
||||
content_filter=content_filter,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.content_filter_strategy import BM25ContentFilter
|
||||
|
||||
result = await crawler.arun(
|
||||
url="https://en.wikipedia.org/wiki/Apple",
|
||||
css_selector="main div#bodyContent",
|
||||
content_filter=BM25ContentFilter()
|
||||
)
|
||||
print(result.markdown_v2.fit_markdown)
|
||||
|
||||
print("\nMarkdown Generation Results:")
|
||||
print(f"1. Original markdown length: {len(result.markdown)}")
|
||||
print(f"2. New markdown versions (markdown_v2):")
|
||||
print(f" - Raw markdown length: {len(result.markdown_v2.raw_markdown)}")
|
||||
print(f" - Citations markdown length: {len(result.markdown_v2.markdown_with_citations)}")
|
||||
print(f" - References section length: {len(result.markdown_v2.references_markdown)}")
|
||||
if result.markdown_v2.fit_markdown:
|
||||
print(f" - Filtered markdown length: {len(result.markdown_v2.fit_markdown)}")
|
||||
|
||||
# Save examples to files
|
||||
output_dir = os.path.join(__data__, "markdown_examples")
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
# Save different versions
|
||||
with open(os.path.join(output_dir, "1_raw_markdown.md"), "w") as f:
|
||||
f.write(result.markdown_v2.raw_markdown)
|
||||
|
||||
with open(os.path.join(output_dir, "2_citations_markdown.md"), "w") as f:
|
||||
f.write(result.markdown_v2.markdown_with_citations)
|
||||
|
||||
with open(os.path.join(output_dir, "3_references.md"), "w") as f:
|
||||
f.write(result.markdown_v2.references_markdown)
|
||||
|
||||
if result.markdown_v2.fit_markdown:
|
||||
with open(os.path.join(output_dir, "4_filtered_markdown.md"), "w") as f:
|
||||
f.write(result.markdown_v2.fit_markdown)
|
||||
|
||||
print(f"\nMarkdown examples saved to: {output_dir}")
|
||||
|
||||
# Show a sample of citations and references
|
||||
print("\nSample of markdown with citations:")
|
||||
print(result.markdown_v2.markdown_with_citations[:500] + "...\n")
|
||||
print("Sample of references:")
|
||||
print('\n'.join(result.markdown_v2.references_markdown.split('\n')[:10]) + "...")
|
||||
|
||||
# 4. Browser Management Example
|
||||
async def browser_management_example():
|
||||
"""Example of using enhanced browser management features"""
|
||||
@@ -208,9 +243,13 @@ async def api_example():
|
||||
headers=headers
|
||||
) as status_response:
|
||||
result = await status_response.json()
|
||||
print(f"Task result: {result}")
|
||||
print(f"Task status: {result['status']}")
|
||||
|
||||
if result["status"] == "completed":
|
||||
print("Task completed!")
|
||||
print("Results:")
|
||||
news = json.loads(result["results"][0]['extracted_content'])
|
||||
print(json.dumps(news[:4], indent=2))
|
||||
break
|
||||
else:
|
||||
await asyncio.sleep(1)
|
||||
@@ -220,15 +259,15 @@ async def main():
|
||||
# print("Running Crawl4AI feature examples...")
|
||||
|
||||
# print("\n1. Running Download Example:")
|
||||
await download_example()
|
||||
# await download_example()
|
||||
|
||||
# print("\n2. Running Content Filtering Example:")
|
||||
await content_filtering_example()
|
||||
# print("\n2. Running Markdown Generation Example:")
|
||||
# await markdown_generation_example()
|
||||
|
||||
# print("\n3. Running Local and Raw HTML Example:")
|
||||
await local_and_raw_html_example()
|
||||
# # print("\n3. Running Local and Raw HTML Example:")
|
||||
# await local_and_raw_html_example()
|
||||
|
||||
# print("\n4. Running Browser Management Example:")
|
||||
# # print("\n4. Running Browser Management Example:")
|
||||
await browser_management_example()
|
||||
|
||||
# print("\n5. Running API Example:")
|
||||
|
||||
131
pages/app.css
131
pages/app.css
@@ -1,131 +0,0 @@
|
||||
:root {
|
||||
--ifm-font-size-base: 100%;
|
||||
--ifm-line-height-base: 1.65;
|
||||
--ifm-font-family-base: system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans, sans-serif,
|
||||
BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji",
|
||||
"Segoe UI Symbol";
|
||||
}
|
||||
html {
|
||||
-webkit-font-smoothing: antialiased;
|
||||
-webkit-text-size-adjust: 100%;
|
||||
text-size-adjust: 100%;
|
||||
font: var(--ifm-font-size-base) / var(--ifm-line-height-base) var(--ifm-font-family-base);
|
||||
}
|
||||
body {
|
||||
background-color: #1a202c;
|
||||
color: #fff;
|
||||
}
|
||||
.tab-content {
|
||||
max-height: 400px;
|
||||
overflow: auto;
|
||||
}
|
||||
pre {
|
||||
white-space: pre-wrap;
|
||||
font-size: 14px;
|
||||
}
|
||||
pre code {
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
/* Custom styling for docs-item class and Markdown generated elements */
|
||||
.docs-item {
|
||||
background-color: #2d3748; /* bg-gray-800 */
|
||||
padding: 1rem; /* p-4 */
|
||||
border-radius: 0.375rem; /* rounded */
|
||||
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); /* shadow-md */
|
||||
margin-bottom: 1rem; /* space between items */
|
||||
line-height: 1.5; /* leading-normal */
|
||||
}
|
||||
|
||||
.docs-item h3,
|
||||
.docs-item h4 {
|
||||
color: #ffffff; /* text-white */
|
||||
font-size: 1.25rem; /* text-xl */
|
||||
font-weight: 700; /* font-bold */
|
||||
margin-bottom: 0.5rem; /* mb-2 */
|
||||
}
|
||||
.docs-item h4 {
|
||||
font-size: 1rem; /* text-xl */
|
||||
}
|
||||
|
||||
.docs-item p {
|
||||
color: #e2e8f0; /* text-gray-300 */
|
||||
margin-bottom: 0.5rem; /* mb-2 */
|
||||
}
|
||||
|
||||
.docs-item code {
|
||||
background-color: #1a202c; /* bg-gray-900 */
|
||||
color: #e2e8f0; /* text-gray-300 */
|
||||
padding: 0.25rem 0.5rem; /* px-2 py-1 */
|
||||
border-radius: 0.25rem; /* rounded */
|
||||
font-size: 0.875rem; /* text-sm */
|
||||
}
|
||||
|
||||
.docs-item pre {
|
||||
background-color: #1a202c; /* bg-gray-900 */
|
||||
color: #e2e8f0; /* text-gray-300 */
|
||||
padding: 0.5rem; /* p-2 */
|
||||
border-radius: 0.375rem; /* rounded */
|
||||
overflow: auto; /* overflow-auto */
|
||||
margin-bottom: 0.5rem; /* mb-2 */
|
||||
}
|
||||
|
||||
.docs-item div {
|
||||
color: #e2e8f0; /* text-gray-300 */
|
||||
font-size: 1rem; /* prose prose-sm */
|
||||
line-height: 1.25rem; /* line-height for readability */
|
||||
}
|
||||
|
||||
/* Adjustments to make prose class more suitable for dark mode */
|
||||
.prose {
|
||||
max-width: none; /* max-w-none */
|
||||
}
|
||||
|
||||
.prose p,
|
||||
.prose ul {
|
||||
margin-bottom: 1rem; /* mb-4 */
|
||||
}
|
||||
|
||||
.prose code {
|
||||
/* background-color: #4a5568; */ /* bg-gray-700 */
|
||||
color: #65a30d; /* text-white */
|
||||
padding: 0.25rem 0.5rem; /* px-1 py-0.5 */
|
||||
border-radius: 0.25rem; /* rounded */
|
||||
display: inline-block; /* inline-block */
|
||||
}
|
||||
|
||||
.prose pre {
|
||||
background-color: #1a202c; /* bg-gray-900 */
|
||||
color: #ffffff; /* text-white */
|
||||
padding: 0.5rem; /* p-2 */
|
||||
border-radius: 0.375rem; /* rounded */
|
||||
}
|
||||
|
||||
.prose h3 {
|
||||
color: #65a30d; /* text-white */
|
||||
font-size: 1.25rem; /* text-xl */
|
||||
font-weight: 700; /* font-bold */
|
||||
margin-bottom: 0.5rem; /* mb-2 */
|
||||
}
|
||||
|
||||
body {
|
||||
background-color: #1a1a1a;
|
||||
color: #b3ff00;
|
||||
}
|
||||
.sidebar {
|
||||
color: #b3ff00;
|
||||
border-right: 1px solid #333;
|
||||
}
|
||||
.sidebar a {
|
||||
color: #b3ff00;
|
||||
text-decoration: none;
|
||||
}
|
||||
.sidebar a:hover {
|
||||
background-color: #555;
|
||||
}
|
||||
.content-section {
|
||||
display: none;
|
||||
}
|
||||
.content-section.active {
|
||||
display: block;
|
||||
}
|
||||
356
pages/app.js
356
pages/app.js
@@ -1,356 +0,0 @@
|
||||
// JavaScript to manage dynamic form changes and logic
|
||||
document.getElementById("extraction-strategy-select").addEventListener("change", function () {
|
||||
const strategy = this.value;
|
||||
const providerModelSelect = document.getElementById("provider-model-select");
|
||||
const tokenInput = document.getElementById("token-input");
|
||||
const instruction = document.getElementById("instruction");
|
||||
const semantic_filter = document.getElementById("semantic_filter");
|
||||
const instruction_div = document.getElementById("instruction_div");
|
||||
const semantic_filter_div = document.getElementById("semantic_filter_div");
|
||||
const llm_settings = document.getElementById("llm_settings");
|
||||
|
||||
if (strategy === "LLMExtractionStrategy") {
|
||||
// providerModelSelect.disabled = false;
|
||||
// tokenInput.disabled = false;
|
||||
// semantic_filter.disabled = true;
|
||||
// instruction.disabled = false;
|
||||
llm_settings.classList.remove("hidden");
|
||||
instruction_div.classList.remove("hidden");
|
||||
semantic_filter_div.classList.add("hidden");
|
||||
} else if (strategy === "NoExtractionStrategy") {
|
||||
semantic_filter_div.classList.add("hidden");
|
||||
instruction_div.classList.add("hidden");
|
||||
llm_settings.classList.add("hidden");
|
||||
} else {
|
||||
// providerModelSelect.disabled = true;
|
||||
// tokenInput.disabled = true;
|
||||
// semantic_filter.disabled = false;
|
||||
// instruction.disabled = true;
|
||||
llm_settings.classList.add("hidden");
|
||||
instruction_div.classList.add("hidden");
|
||||
semantic_filter_div.classList.remove("hidden");
|
||||
}
|
||||
|
||||
|
||||
});
|
||||
|
||||
// Get the selected provider model and token from local storage
|
||||
const storedProviderModel = localStorage.getItem("provider_model");
|
||||
const storedToken = localStorage.getItem(storedProviderModel);
|
||||
|
||||
if (storedProviderModel) {
|
||||
document.getElementById("provider-model-select").value = storedProviderModel;
|
||||
}
|
||||
|
||||
if (storedToken) {
|
||||
document.getElementById("token-input").value = storedToken;
|
||||
}
|
||||
|
||||
// Handle provider model dropdown change
|
||||
document.getElementById("provider-model-select").addEventListener("change", () => {
|
||||
const selectedProviderModel = document.getElementById("provider-model-select").value;
|
||||
const storedToken = localStorage.getItem(selectedProviderModel);
|
||||
|
||||
if (storedToken) {
|
||||
document.getElementById("token-input").value = storedToken;
|
||||
} else {
|
||||
document.getElementById("token-input").value = "";
|
||||
}
|
||||
});
|
||||
|
||||
// Fetch total count from the database
|
||||
axios
|
||||
.get("/total-count")
|
||||
.then((response) => {
|
||||
document.getElementById("total-count").textContent = response.data.count;
|
||||
})
|
||||
.catch((error) => console.error(error));
|
||||
|
||||
// Handle crawl button click
|
||||
document.getElementById("crawl-btn").addEventListener("click", () => {
|
||||
// validate input to have both URL and API token
|
||||
// if selected extraction strategy is LLMExtractionStrategy, then API token is required
|
||||
if (document.getElementById("extraction-strategy-select").value === "LLMExtractionStrategy") {
|
||||
if (!document.getElementById("url-input").value || !document.getElementById("token-input").value) {
|
||||
alert("Please enter both URL(s) and API token.");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
const selectedProviderModel = document.getElementById("provider-model-select").value;
|
||||
const apiToken = document.getElementById("token-input").value;
|
||||
const extractBlocks = document.getElementById("extract-blocks-checkbox").checked;
|
||||
const bypassCache = document.getElementById("bypass-cache-checkbox").checked;
|
||||
|
||||
// Save the selected provider model and token to local storage
|
||||
localStorage.setItem("provider_model", selectedProviderModel);
|
||||
localStorage.setItem(selectedProviderModel, apiToken);
|
||||
|
||||
const urlsInput = document.getElementById("url-input").value;
|
||||
const urls = urlsInput.split(",").map((url) => url.trim());
|
||||
const data = {
|
||||
urls: urls,
|
||||
include_raw_html: true,
|
||||
bypass_cache: bypassCache,
|
||||
extract_blocks: extractBlocks,
|
||||
word_count_threshold: parseInt(document.getElementById("threshold").value),
|
||||
extraction_strategy: document.getElementById("extraction-strategy-select").value,
|
||||
extraction_strategy_args: {
|
||||
provider: selectedProviderModel,
|
||||
api_token: apiToken,
|
||||
instruction: document.getElementById("instruction").value,
|
||||
semantic_filter: document.getElementById("semantic_filter").value,
|
||||
},
|
||||
chunking_strategy: document.getElementById("chunking-strategy-select").value,
|
||||
chunking_strategy_args: {},
|
||||
css_selector: document.getElementById("css-selector").value,
|
||||
screenshot: document.getElementById("screenshot-checkbox").checked,
|
||||
// instruction: document.getElementById("instruction").value,
|
||||
// semantic_filter: document.getElementById("semantic_filter").value,
|
||||
verbose: true,
|
||||
};
|
||||
|
||||
// import requests
|
||||
|
||||
// data = {
|
||||
// "urls": [
|
||||
// "https://www.nbcnews.com/business"
|
||||
// ],
|
||||
// "word_count_threshold": 10,
|
||||
// "extraction_strategy": "NoExtractionStrategy",
|
||||
// }
|
||||
|
||||
// response = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally
|
||||
// print(response.json())
|
||||
|
||||
// save api token to local storage
|
||||
localStorage.setItem("api_token", document.getElementById("token-input").value);
|
||||
|
||||
document.getElementById("loading").classList.remove("hidden");
|
||||
document.getElementById("result").style.visibility = "hidden";
|
||||
document.getElementById("code_help").style.visibility = "hidden";
|
||||
|
||||
axios
|
||||
.post("/crawl", data)
|
||||
.then((response) => {
|
||||
const result = response.data.results[0];
|
||||
const parsedJson = JSON.parse(result.extracted_content);
|
||||
document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
|
||||
document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
|
||||
document.getElementById("markdown-result").textContent = result.markdown;
|
||||
document.getElementById("media-result").textContent = JSON.stringify( result.media, null, 2);
|
||||
if (result.screenshot){
|
||||
const imgElement = document.createElement("img");
|
||||
// Set the src attribute with the base64 data
|
||||
imgElement.src = `data:image/png;base64,${result.screenshot}`;
|
||||
document.getElementById("screenshot-result").innerHTML = "";
|
||||
document.getElementById("screenshot-result").appendChild(imgElement);
|
||||
}
|
||||
|
||||
// Update code examples dynamically
|
||||
const extractionStrategy = data.extraction_strategy;
|
||||
const isLLMExtraction = extractionStrategy === "LLMExtractionStrategy";
|
||||
|
||||
// REMOVE API TOKEN FROM CODE EXAMPLES
|
||||
data.extraction_strategy_args.api_token = "your_api_token";
|
||||
|
||||
if (data.extraction_strategy === "NoExtractionStrategy") {
|
||||
delete data.extraction_strategy_args;
|
||||
delete data.extrac_blocks;
|
||||
}
|
||||
|
||||
if (data.chunking_strategy === "RegexChunking") {
|
||||
delete data.chunking_strategy_args;
|
||||
}
|
||||
|
||||
delete data.verbose;
|
||||
|
||||
if (data.css_selector === "") {
|
||||
delete data.css_selector;
|
||||
}
|
||||
|
||||
if (!data.bypass_cache) {
|
||||
delete data.bypass_cache;
|
||||
}
|
||||
|
||||
if (!data.extract_blocks) {
|
||||
delete data.extract_blocks;
|
||||
}
|
||||
|
||||
if (!data.include_raw_html) {
|
||||
delete data.include_raw_html;
|
||||
}
|
||||
|
||||
document.getElementById(
|
||||
"curl-code"
|
||||
).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({
|
||||
...data,
|
||||
api_token: isLLMExtraction ? "your_api_token" : undefined,
|
||||
}, null, 2)}' https://crawl4ai.com/crawl`;
|
||||
|
||||
document.getElementById("python-code").textContent = `import requests\n\ndata = ${JSON.stringify(
|
||||
{ ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
|
||||
null,
|
||||
2
|
||||
)}\n\nresponse = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally \nprint(response.json())`;
|
||||
|
||||
document.getElementById(
|
||||
"nodejs-code"
|
||||
).textContent = `const axios = require('axios');\n\nconst data = ${JSON.stringify(
|
||||
{ ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
|
||||
null,
|
||||
2
|
||||
)};\n\naxios.post("https://crawl4ai.com/crawl", data) // OR local host if your run locally \n .then(response => console.log(response.data))\n .catch(error => console.error(error));`;
|
||||
|
||||
document.getElementById(
|
||||
"library-code"
|
||||
).textContent = `from crawl4ai.web_crawler import WebCrawler\nfrom crawl4ai.extraction_strategy import *\nfrom crawl4ai.chunking_strategy import *\n\ncrawler = WebCrawler()\ncrawler.warmup()\n\nresult = crawler.run(\n url='${
|
||||
urls[0]
|
||||
}',\n word_count_threshold=${data.word_count_threshold},\n extraction_strategy=${
|
||||
isLLMExtraction
|
||||
? `${extractionStrategy}(provider="${data.provider_model}", api_token="${data.api_token}")`
|
||||
: extractionStrategy + "()"
|
||||
},\n chunking_strategy=${data.chunking_strategy}(),\n bypass_cache=${
|
||||
data.bypass_cache
|
||||
},\n css_selector="${data.css_selector}"\n)\nprint(result)`;
|
||||
|
||||
// Highlight code syntax
|
||||
hljs.highlightAll();
|
||||
|
||||
// Select JSON tab by default
|
||||
document.querySelector('.tab-btn[data-tab="json"]').click();
|
||||
|
||||
document.getElementById("loading").classList.add("hidden");
|
||||
|
||||
document.getElementById("result").style.visibility = "visible";
|
||||
document.getElementById("code_help").style.visibility = "visible";
|
||||
|
||||
// increment the total count
|
||||
document.getElementById("total-count").textContent =
|
||||
parseInt(document.getElementById("total-count").textContent) + 1;
|
||||
})
|
||||
.catch((error) => {
|
||||
console.error(error);
|
||||
document.getElementById("loading").classList.add("hidden");
|
||||
});
|
||||
});
|
||||
|
||||
// Handle tab clicks
|
||||
document.querySelectorAll(".tab-btn").forEach((btn) => {
|
||||
btn.addEventListener("click", () => {
|
||||
const tab = btn.dataset.tab;
|
||||
document.querySelectorAll(".tab-btn").forEach((b) => b.classList.remove("bg-lime-700", "text-white"));
|
||||
btn.classList.add("bg-lime-700", "text-white");
|
||||
document.querySelectorAll(".tab-content.code pre").forEach((el) => el.classList.add("hidden"));
|
||||
document.getElementById(`${tab}-result`).parentElement.classList.remove("hidden");
|
||||
});
|
||||
});
|
||||
|
||||
// Handle code tab clicks
|
||||
document.querySelectorAll(".code-tab-btn").forEach((btn) => {
|
||||
btn.addEventListener("click", () => {
|
||||
const tab = btn.dataset.tab;
|
||||
document.querySelectorAll(".code-tab-btn").forEach((b) => b.classList.remove("bg-lime-700", "text-white"));
|
||||
btn.classList.add("bg-lime-700", "text-white");
|
||||
document.querySelectorAll(".tab-content.result pre").forEach((el) => el.classList.add("hidden"));
|
||||
document.getElementById(`${tab}-code`).parentElement.classList.remove("hidden");
|
||||
});
|
||||
});
|
||||
|
||||
// Handle copy to clipboard button clicks
|
||||
|
||||
async function copyToClipboard(text) {
|
||||
if (navigator.clipboard && navigator.clipboard.writeText) {
|
||||
return navigator.clipboard.writeText(text);
|
||||
} else {
|
||||
return fallbackCopyTextToClipboard(text);
|
||||
}
|
||||
}
|
||||
|
||||
function fallbackCopyTextToClipboard(text) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const textArea = document.createElement("textarea");
|
||||
textArea.value = text;
|
||||
|
||||
// Avoid scrolling to bottom
|
||||
textArea.style.top = "0";
|
||||
textArea.style.left = "0";
|
||||
textArea.style.position = "fixed";
|
||||
|
||||
document.body.appendChild(textArea);
|
||||
textArea.focus();
|
||||
textArea.select();
|
||||
|
||||
try {
|
||||
const successful = document.execCommand("copy");
|
||||
if (successful) {
|
||||
resolve();
|
||||
} else {
|
||||
reject();
|
||||
}
|
||||
} catch (err) {
|
||||
reject(err);
|
||||
}
|
||||
|
||||
document.body.removeChild(textArea);
|
||||
});
|
||||
}
|
||||
|
||||
document.querySelectorAll(".copy-btn").forEach((btn) => {
|
||||
btn.addEventListener("click", () => {
|
||||
const target = btn.dataset.target;
|
||||
const code = document.getElementById(target).textContent;
|
||||
//navigator.clipboard.writeText(code).then(() => {
|
||||
copyToClipboard(code).then(() => {
|
||||
btn.textContent = "Copied!";
|
||||
setTimeout(() => {
|
||||
btn.textContent = "Copy";
|
||||
}, 2000);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
document.addEventListener("DOMContentLoaded", async () => {
|
||||
try {
|
||||
const extractionResponse = await fetch("/strategies/extraction");
|
||||
const extractionStrategies = await extractionResponse.json();
|
||||
|
||||
const chunkingResponse = await fetch("/strategies/chunking");
|
||||
const chunkingStrategies = await chunkingResponse.json();
|
||||
|
||||
renderStrategies("extraction-strategies", extractionStrategies);
|
||||
renderStrategies("chunking-strategies", chunkingStrategies);
|
||||
} catch (error) {
|
||||
console.error("Error fetching strategies:", error);
|
||||
}
|
||||
});
|
||||
|
||||
function renderStrategies(containerId, strategies) {
|
||||
const container = document.getElementById(containerId);
|
||||
container.innerHTML = ""; // Clear any existing content
|
||||
strategies = JSON.parse(strategies);
|
||||
Object.entries(strategies).forEach(([strategy, description]) => {
|
||||
const strategyElement = document.createElement("div");
|
||||
strategyElement.classList.add("bg-zinc-800", "p-4", "rounded", "shadow-md", "docs-item");
|
||||
|
||||
const strategyDescription = document.createElement("div");
|
||||
strategyDescription.classList.add("text-gray-300", "prose", "prose-sm");
|
||||
strategyDescription.innerHTML = marked.parse(description);
|
||||
|
||||
strategyElement.appendChild(strategyDescription);
|
||||
|
||||
container.appendChild(strategyElement);
|
||||
});
|
||||
}
|
||||
document.querySelectorAll(".sidebar a").forEach((link) => {
|
||||
link.addEventListener("click", function (event) {
|
||||
event.preventDefault();
|
||||
document.querySelectorAll(".content-section").forEach((section) => {
|
||||
section.classList.remove("active");
|
||||
});
|
||||
const target = event.target.getAttribute("data-target");
|
||||
document.getElementById(target).classList.add("active");
|
||||
});
|
||||
});
|
||||
// Highlight code syntax
|
||||
hljs.highlightAll();
|
||||
@@ -1,971 +0,0 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<title>Crawl4AI</title>
|
||||
|
||||
<link rel="preconnect" href="https://fonts.googleapis.com" />
|
||||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
|
||||
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@100..900&display=swap" rel="stylesheet" />
|
||||
|
||||
<!-- <link href="https://cdn.jsdelivr.net/npm/tailwindcss@3.4.3/dist/tailwind.min.css" rel="stylesheet" /> -->
|
||||
<script src="https://cdn.tailwindcss.com"></script>
|
||||
<script src="https://cdn.jsdelivr.net/npm/axios/dist/axios.min.js"></script>
|
||||
<link
|
||||
rel="stylesheet"
|
||||
href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/styles/monokai.min.css"
|
||||
/>
|
||||
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
|
||||
|
||||
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/highlight.min.js"></script>
|
||||
<style>
|
||||
:root {
|
||||
--ifm-font-size-base: 100%;
|
||||
--ifm-line-height-base: 1.65;
|
||||
--ifm-font-family-base: system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans,
|
||||
sans-serif, BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif, "Apple Color Emoji",
|
||||
"Segoe UI Emoji", "Segoe UI Symbol";
|
||||
}
|
||||
html {
|
||||
-webkit-font-smoothing: antialiased;
|
||||
-webkit-text-size-adjust: 100%;
|
||||
text-size-adjust: 100%;
|
||||
font: var(--ifm-font-size-base) / var(--ifm-line-height-base) var(--ifm-font-family-base);
|
||||
}
|
||||
body {
|
||||
background-color: #1a202c;
|
||||
color: #fff;
|
||||
}
|
||||
.tab-content {
|
||||
max-height: 400px;
|
||||
overflow: auto;
|
||||
}
|
||||
pre {
|
||||
white-space: pre-wrap;
|
||||
font-size: 14px;
|
||||
}
|
||||
pre code {
|
||||
width: 100%;
|
||||
}
|
||||
</style>
|
||||
<style>
|
||||
/* Custom styling for docs-item class and Markdown generated elements */
|
||||
.docs-item {
|
||||
background-color: #2d3748; /* bg-gray-800 */
|
||||
padding: 1rem; /* p-4 */
|
||||
border-radius: 0.375rem; /* rounded */
|
||||
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); /* shadow-md */
|
||||
margin-bottom: 1rem; /* space between items */
|
||||
}
|
||||
|
||||
.docs-item h3,
|
||||
.docs-item h4 {
|
||||
color: #ffffff; /* text-white */
|
||||
font-size: 1.25rem; /* text-xl */
|
||||
font-weight: 700; /* font-bold */
|
||||
margin-bottom: 0.5rem; /* mb-2 */
|
||||
}
|
||||
|
||||
.docs-item p {
|
||||
color: #e2e8f0; /* text-gray-300 */
|
||||
margin-bottom: 0.5rem; /* mb-2 */
|
||||
}
|
||||
|
||||
.docs-item code {
|
||||
background-color: #1a202c; /* bg-gray-900 */
|
||||
color: #e2e8f0; /* text-gray-300 */
|
||||
padding: 0.25rem 0.5rem; /* px-2 py-1 */
|
||||
border-radius: 0.25rem; /* rounded */
|
||||
}
|
||||
|
||||
.docs-item pre {
|
||||
background-color: #1a202c; /* bg-gray-900 */
|
||||
color: #e2e8f0; /* text-gray-300 */
|
||||
padding: 0.5rem; /* p-2 */
|
||||
border-radius: 0.375rem; /* rounded */
|
||||
overflow: auto; /* overflow-auto */
|
||||
margin-bottom: 0.5rem; /* mb-2 */
|
||||
}
|
||||
|
||||
.docs-item div {
|
||||
color: #e2e8f0; /* text-gray-300 */
|
||||
font-size: 1rem; /* prose prose-sm */
|
||||
line-height: 1.25rem; /* line-height for readability */
|
||||
}
|
||||
|
||||
/* Adjustments to make prose class more suitable for dark mode */
|
||||
.prose {
|
||||
max-width: none; /* max-w-none */
|
||||
}
|
||||
|
||||
.prose p,
|
||||
.prose ul {
|
||||
margin-bottom: 1rem; /* mb-4 */
|
||||
}
|
||||
|
||||
.prose code {
|
||||
/* background-color: #4a5568; */ /* bg-gray-700 */
|
||||
color: #65a30d; /* text-white */
|
||||
padding: 0.25rem 0.5rem; /* px-1 py-0.5 */
|
||||
border-radius: 0.25rem; /* rounded */
|
||||
display: inline-block; /* inline-block */
|
||||
}
|
||||
|
||||
.prose pre {
|
||||
background-color: #1a202c; /* bg-gray-900 */
|
||||
color: #ffffff; /* text-white */
|
||||
padding: 0.5rem; /* p-2 */
|
||||
border-radius: 0.375rem; /* rounded */
|
||||
}
|
||||
|
||||
.prose h3 {
|
||||
color: #65a30d; /* text-white */
|
||||
font-size: 1.25rem; /* text-xl */
|
||||
font-weight: 700; /* font-bold */
|
||||
margin-bottom: 0.5rem; /* mb-2 */
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body class="bg-black text-gray-200">
|
||||
<header class="bg-zinc-950 text-white py-4 flex">
|
||||
<div class="mx-auto px-4">
|
||||
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts</h1>
|
||||
</div>
|
||||
<div class="mx-auto px-4 flex font-bold text-xl gap-2">
|
||||
<span>📊 Total Website Processed</span>
|
||||
<span id="total-count" class="text-lime-400">2</span>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<section class="try-it py-8 px-16 pb-20">
|
||||
<div class="container mx-auto px-4">
|
||||
<h2 class="text-2xl font-bold mb-4">Try It Now</h2>
|
||||
<div class="grid grid-cols-1 lg:grid-cols-3 gap-4">
|
||||
<div class="space-y-4">
|
||||
<div class="flex flex-col">
|
||||
<label for="url-input" class="text-lime-500 font-bold text-xs">URL(s)</label>
|
||||
<input
|
||||
type="text"
|
||||
id="url-input"
|
||||
value="https://www.nbcnews.com/business"
|
||||
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-lime-500"
|
||||
placeholder="Enter URL(s) separated by commas"
|
||||
/>
|
||||
</div>
|
||||
<div class="flex flex-col">
|
||||
<label for="threshold" class="text-lime-500 font-bold text-xs">Min Words Threshold</label>
|
||||
<select
|
||||
id="threshold"
|
||||
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-lime-500"
|
||||
>
|
||||
<option value="5">5</option>
|
||||
<option value="10" selected>10</option>
|
||||
<option value="15">15</option>
|
||||
<option value="20">20</option>
|
||||
<option value="25">25</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="flex flex-col">
|
||||
<label for="css-selector" class="text-lime-500 font-bold text-xs">CSS Selector</label>
|
||||
<input
|
||||
type="text"
|
||||
id="css-selector"
|
||||
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-lime-500"
|
||||
placeholder="Enter CSS Selector"
|
||||
/>
|
||||
</div>
|
||||
<div class="flex flex-col">
|
||||
<label for="extraction-strategy-select" class="text-lime-500 font-bold text-xs"
|
||||
>Extraction Strategy</label
|
||||
>
|
||||
<select
|
||||
id="extraction-strategy-select"
|
||||
class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-lime-500"
|
||||
>
|
||||
<option value="CosineStrategy">CosineStrategy</option>
|
||||
<option value="LLMExtractionStrategy">LLMExtractionStrategy</option>
|
||||
<option value="NoExtractionStrategy">NoExtractionStrategy</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="flex flex-col">
|
||||
<label for="chunking-strategy-select" class="text-lime-500 font-bold text-xs"
|
||||
>Chunking Strategy</label
|
||||
>
|
||||
<select
|
||||
id="chunking-strategy-select"
|
||||
class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-lime-500"
|
||||
>
|
||||
<option value="RegexChunking">RegexChunking</option>
|
||||
<option value="NlpSentenceChunking">NlpSentenceChunking</option>
|
||||
<option value="TopicSegmentationChunking">TopicSegmentationChunking</option>
|
||||
<option value="FixedLengthWordChunking">FixedLengthWordChunking</option>
|
||||
<option value="SlidingWindowChunking">SlidingWindowChunking</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="flex flex-col">
|
||||
<label for="provider-model-select" class="text-lime-500 font-bold text-xs"
|
||||
>Provider Model</label
|
||||
>
|
||||
<select
|
||||
id="provider-model-select"
|
||||
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-lime-500"
|
||||
disabled
|
||||
>
|
||||
<option value="groq/llama3-70b-8192">groq/llama3-70b-8192</option>
|
||||
<option value="groq/llama3-8b-8192">groq/llama3-8b-8192</option>
|
||||
<option value="openai/gpt-4-turbo">gpt-4-turbo</option>
|
||||
<option value="openai/gpt-3.5-turbo">gpt-3.5-turbo</option>
|
||||
<option value="anthropic/claude-3-haiku-20240307">claude-3-haiku</option>
|
||||
<option value="anthropic/claude-3-opus-20240229">claude-3-opus</option>
|
||||
<option value="anthropic/claude-3-sonnet-20240229">claude-3-sonnet</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="flex flex-col">
|
||||
<label for="token-input" class="text-lime-500 font-bold text-xs">API Token</label>
|
||||
<input
|
||||
type="password"
|
||||
id="token-input"
|
||||
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-lime-500"
|
||||
placeholder="Enter Groq API token"
|
||||
disabled
|
||||
/>
|
||||
</div>
|
||||
<div class="flex gap-3">
|
||||
<div class="flex items-center gap-2">
|
||||
<input type="checkbox" id="bypass-cache-checkbox" />
|
||||
<label for="bypass-cache-checkbox" class="text-lime-500 font-bold">Bypass Cache</label>
|
||||
</div>
|
||||
<div class="flex items-center gap-2">
|
||||
<input type="checkbox" id="extract-blocks-checkbox" checked />
|
||||
<label for="extract-blocks-checkbox" class="text-lime-500 font-bold"
|
||||
>Extract Blocks</label
|
||||
>
|
||||
</div>
|
||||
<button id="crawl-btn" class="bg-lime-600 text-black font-bold px-4 py-0 rounded">
|
||||
Crawl
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="result" class=" ">
|
||||
<div id="loading" class="hidden">
|
||||
<p class="text-white">Loading... Please wait.</p>
|
||||
</div>
|
||||
<div class="tab-buttons flex gap-2">
|
||||
<button
|
||||
class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="json"
|
||||
>
|
||||
JSON
|
||||
</button>
|
||||
<button
|
||||
class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="cleaned-html"
|
||||
>
|
||||
Cleaned HTML
|
||||
</button>
|
||||
<button
|
||||
class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="markdown"
|
||||
>
|
||||
Markdown
|
||||
</button>
|
||||
</div>
|
||||
<div class="tab-content code bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
|
||||
<pre class="h-full flex"><code id="json-result" class="language-json"></code></pre>
|
||||
<pre
|
||||
class="hidden h-full flex"
|
||||
><code id="cleaned-html-result" class="language-html"></code></pre>
|
||||
<pre
|
||||
class="hidden h-full flex"
|
||||
><code id="markdown-result" class="language-markdown"></code></pre>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="code_help" class=" ">
|
||||
<div class="tab-buttons flex gap-2">
|
||||
<button
|
||||
class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="curl"
|
||||
>
|
||||
cURL
|
||||
</button>
|
||||
<button
|
||||
class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="library"
|
||||
>
|
||||
Python Library
|
||||
</button>
|
||||
<button
|
||||
class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="python"
|
||||
>
|
||||
Python (Request)
|
||||
</button>
|
||||
<button
|
||||
class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="nodejs"
|
||||
>
|
||||
Node.js
|
||||
</button>
|
||||
</div>
|
||||
<div class="tab-content result bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
|
||||
<pre class="h-full flex relative">
|
||||
<code id="curl-code" class="language-bash"></code>
|
||||
<button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="curl-code">Copy</button>
|
||||
</pre>
|
||||
<pre class="hidden h-full flex relative">
|
||||
<code id="python-code" class="language-python"></code>
|
||||
<button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="python-code">Copy</button>
|
||||
</pre>
|
||||
<pre class="hidden h-full flex relative">
|
||||
<code id="nodejs-code" class="language-javascript"></code>
|
||||
<button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="nodejs-code">Copy</button>
|
||||
</pre>
|
||||
<pre class="hidden h-full flex relative">
|
||||
<code id="library-code" class="language-python"></code>
|
||||
<button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="library-code">Copy</button>
|
||||
</pre>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
<section class="bg-zinc-900 text-zinc-300 p-6 px-20">
|
||||
<div class="grid grid-cols-2 gap-4 p-4 bg-zinc-900 text-lime-500">
|
||||
<!-- Step 1 -->
|
||||
<div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
|
||||
🌟 <strong>Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun!</strong>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-2 rounded">
|
||||
First Step: Create an instance of WebCrawler and call the <code>warmup()</code> function.
|
||||
</div>
|
||||
<div>
|
||||
<pre><code class="language-python">crawler = WebCrawler()
|
||||
crawler.warmup()</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 2 -->
|
||||
<div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
|
||||
🧠 <strong>Understanding 'bypass_cache' and 'include_raw_html' parameters:</strong>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-2 rounded">First crawl (caches the result):</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(url="https://www.nbcnews.com/business")</code></pre>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-2 rounded">Second crawl (Force to crawl again):</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)</code></pre>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-2 rounded">Crawl result without raw HTML content:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 3 -->
|
||||
<div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
|
||||
📄
|
||||
<strong
|
||||
>The 'include_raw_html' parameter, when set to True, includes the raw HTML content in the
|
||||
response. By default, it is set to True.</strong
|
||||
>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-2 rounded">Set <code>always_by_pass_cache</code> to True:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">crawler.always_by_pass_cache = True</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 4 -->
|
||||
<div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
|
||||
🧩 <strong>Let's add a chunking strategy: RegexChunking!</strong>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-2 rounded">Using RegexChunking:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
chunking_strategy=RegexChunking(patterns=["\n\n"])
|
||||
)</code></pre>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-2 rounded">Using NlpSentenceChunking:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
chunking_strategy=NlpSentenceChunking()
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 5 -->
|
||||
<div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
|
||||
🧠 <strong>Let's get smarter with an extraction strategy: CosineStrategy!</strong>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-2 rounded">Using CosineStrategy:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3)
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 6 -->
|
||||
<div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
|
||||
🤖 <strong>Time to bring in the big guns: LLMExtractionStrategy without instructions!</strong>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-2 rounded">Using LLMExtractionStrategy without instructions:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 7 -->
|
||||
<div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
|
||||
📜 <strong>Let's make it even more interesting: LLMExtractionStrategy with instructions!</strong>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-2 rounded">Using LLMExtractionStrategy with instructions:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
instruction="I am interested in only financial news"
|
||||
)
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 8 -->
|
||||
<div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
|
||||
🎯 <strong>Targeted extraction: Let's use a CSS selector to extract only H2 tags!</strong>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-2 rounded">Using CSS selector to extract H2 tags:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
css_selector="h2"
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 9 -->
|
||||
<div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
|
||||
🖱️ <strong>Let's get interactive: Passing JavaScript code to click 'Load More' button!</strong>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-2 rounded">Using JavaScript to click 'Load More' button:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">js_code = """
|
||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||
loadMoreButton && loadMoreButton.click();
|
||||
"""
|
||||
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
||||
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
|
||||
result = crawler.run(url="https://www.nbcnews.com/business")</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Conclusion -->
|
||||
<div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
|
||||
🎉
|
||||
<strong
|
||||
>Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl
|
||||
the web like a pro! 🕸️</strong
|
||||
>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
<section class="bg-zinc-900 text-zinc-300 p-6 px-20">
|
||||
<h1 class="text-3xl font-bold mb-4">Installation 💻</h1>
|
||||
<p class="mb-4">
|
||||
There are two ways to use Crawl4AI: as a library in your Python projects or as a standalone local
|
||||
server.
|
||||
</p>
|
||||
|
||||
<p class="mb-4">
|
||||
You can also try Crawl4AI in a Google Colab
|
||||
<a href="https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk"
|
||||
><img
|
||||
src="https://colab.research.google.com/assets/colab-badge.svg"
|
||||
alt="Open In Colab"
|
||||
style="display: inline-block; width: 100px; height: 20px"
|
||||
/></a>
|
||||
</p>
|
||||
|
||||
<h2 class="text-2xl font-bold mb-2">Using Crawl4AI as a Library 📚</h2>
|
||||
<p class="mb-4">To install Crawl4AI as a library, follow these steps:</p>
|
||||
|
||||
<ol class="list-decimal list-inside mb-4">
|
||||
<li class="mb-2">
|
||||
Install the package from GitHub:
|
||||
<pre
|
||||
class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
|
||||
><code>pip install git+https://github.com/unclecode/crawl4ai.git</code></pre>
|
||||
</li>
|
||||
<li class="mb-2">
|
||||
Alternatively, you can clone the repository and install the package locally:
|
||||
<pre
|
||||
class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
|
||||
><code class = "language-python bash">virtualenv venv
|
||||
source venv/bin/activate
|
||||
git clone https://github.com/unclecode/crawl4ai.git
|
||||
cd crawl4ai
|
||||
pip install -e .
|
||||
</code></pre>
|
||||
</li>
|
||||
<li>
|
||||
Import the necessary modules in your Python script:
|
||||
<pre
|
||||
class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
|
||||
><code class = "language-python hljs">from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.chunking_strategy import *
|
||||
from crawl4ai.extraction_strategy import *
|
||||
import os
|
||||
|
||||
crawler = WebCrawler()
|
||||
|
||||
# Single page crawl
|
||||
single_url = UrlModel(url='https://www.nbcnews.com/business', forced=False)
|
||||
result = crawl4ai.fetch_page(
|
||||
url='https://www.nbcnews.com/business',
|
||||
word_count_threshold=5, # Minimum word count for a HTML tag to be considered as a worthy block
|
||||
chunking_strategy= RegexChunking( patterns = ["\\n\\n"]), # Default is RegexChunking
|
||||
extraction_strategy= CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3) # Default is CosineStrategy
|
||||
# extraction_strategy= LLMExtractionStrategy(provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY')),
|
||||
bypass_cache=False,
|
||||
extract_blocks =True, # Whether to extract semantical blocks of text from the HTML
|
||||
css_selector = "", # Eg: "div.article-body"
|
||||
verbose=True,
|
||||
include_raw_html=True, # Whether to include the raw HTML content in the response
|
||||
)
|
||||
print(result.model_dump())
|
||||
</code></pre>
|
||||
</li>
|
||||
</ol>
|
||||
<p class="mb-4">
|
||||
For more information about how to run Crawl4AI as a local server, please refer to the
|
||||
<a href="https://github.com/unclecode/crawl4ai" class="text-blue-400">GitHub repository</a>.
|
||||
</p>
|
||||
|
||||
</section>
|
||||
|
||||
<section class="bg-zinc-900 text-zinc-300 p-6 px-20">
|
||||
<h1 class="text-3xl font-bold mb-4">📖 Parameters</h1>
|
||||
<div class="overflow-x-auto">
|
||||
<table class="min-w-full bg-zinc-800 border border-zinc-700">
|
||||
<thead>
|
||||
<tr>
|
||||
<th class="py-2 px-4 border-b border-zinc-700">Parameter</th>
|
||||
<th class="py-2 px-4 border-b border-zinc-700">Description</th>
|
||||
<th class="py-2 px-4 border-b border-zinc-700">Required</th>
|
||||
<th class="py-2 px-4 border-b border-zinc-700">Default Value</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">urls</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">
|
||||
A list of URLs to crawl and extract data from.
|
||||
</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">Yes</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">-</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">include_raw_html</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">
|
||||
Whether to include the raw HTML content in the response.
|
||||
</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">No</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">false</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">bypass_cache</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">
|
||||
Whether to force a fresh crawl even if the URL has been previously crawled.
|
||||
</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">No</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">false</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">extract_blocks</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">
|
||||
Whether to extract semantical blocks of text from the HTML.
|
||||
</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">No</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">true</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">word_count_threshold</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">
|
||||
The minimum number of words a block must contain to be considered meaningful (minimum
|
||||
value is 5).
|
||||
</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">No</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">5</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">extraction_strategy</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">
|
||||
The strategy to use for extracting content from the HTML (e.g., "CosineStrategy").
|
||||
</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">No</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">CosineStrategy</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">chunking_strategy</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">
|
||||
The strategy to use for chunking the text before processing (e.g., "RegexChunking").
|
||||
</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">No</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">RegexChunking</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">css_selector</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">
|
||||
The CSS selector to target specific parts of the HTML for extraction.
|
||||
</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">No</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">None</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="py-2 px-4">verbose</td>
|
||||
<td class="py-2 px-4">Whether to enable verbose logging.</td>
|
||||
<td class="py-2 px-4">No</td>
|
||||
<td class="py-2 px-4">true</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section id="extraction" class="py-8 px-20">
|
||||
<div class="overflow-x-auto mx-auto px-6">
|
||||
<h2 class="text-2xl font-bold mb-4">Extraction Strategies</h2>
|
||||
<div id="extraction-strategies" class="space-y-4"></div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section id="chunking" class="py-8 px-20">
|
||||
<div class="overflow-x-auto mx-auto px-6">
|
||||
<h2 class="text-2xl font-bold mb-4">Chunking Strategies</h2>
|
||||
<div id="chunking-strategies" class="space-y-4"></div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section class="hero bg-zinc-900 py-8 px-20">
|
||||
<div class="container mx-auto px-4">
|
||||
<h2 class="text-3xl font-bold mb-4">🤔 Why building this?</h2>
|
||||
<p class="text-lg mb-4">
|
||||
In recent times, we've witnessed a surge of startups emerging, riding the AI hype wave and charging
|
||||
for services that should rightfully be accessible to everyone. 🌍💸 One such example is scraping and
|
||||
crawling web pages and transforming them into a format suitable for Large Language Models (LLMs).
|
||||
🕸️🤖 We believe that building a business around this is not the right approach; instead, it should
|
||||
definitely be open-source. 🆓🌟 So, if you possess the skills to build such tools and share our
|
||||
philosophy, we invite you to join our "Robinhood" band and help set these products free for the
|
||||
benefit of all. 🤝💪
|
||||
</p>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section class="installation py-8 px-20">
|
||||
<div class="container mx-auto px-4">
|
||||
<h2 class="text-2xl font-bold mb-4">⚙️ Installation</h2>
|
||||
<p class="mb-4">
|
||||
To install and run Crawl4AI as a library or a local server, please refer to the 📚
|
||||
<a href="https://github.com/unclecode/crawl4ai" class="text-blue-400">GitHub repository</a>.
|
||||
</p>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<footer class="bg-zinc-900 text-white py-4">
|
||||
<div class="container mx-auto px-4">
|
||||
<div class="flex justify-between items-center">
|
||||
<p>© 2024 Crawl4AI. All rights reserved.</p>
|
||||
<div class="social-links">
|
||||
<a
|
||||
href="https://github.com/unclecode/crawl4ai"
|
||||
class="text-white hover:text-gray-300 mx-2"
|
||||
target="_blank"
|
||||
>😺 GitHub</a
|
||||
>
|
||||
<a
|
||||
href="https://twitter.com/unclecode"
|
||||
class="text-white hover:text-gray-300 mx-2"
|
||||
target="_blank"
|
||||
>🐦 Twitter</a
|
||||
>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</footer>
|
||||
|
||||
<script>
|
||||
// JavaScript to manage dynamic form changes and logic
|
||||
document.getElementById("extraction-strategy-select").addEventListener("change", function () {
|
||||
const strategy = this.value;
|
||||
const providerModelSelect = document.getElementById("provider-model-select");
|
||||
const tokenInput = document.getElementById("token-input");
|
||||
|
||||
if (strategy === "LLMExtractionStrategy") {
|
||||
providerModelSelect.disabled = false;
|
||||
tokenInput.disabled = false;
|
||||
} else {
|
||||
providerModelSelect.disabled = true;
|
||||
tokenInput.disabled = true;
|
||||
}
|
||||
});
|
||||
|
||||
// Get the selected provider model and token from local storage
|
||||
const storedProviderModel = localStorage.getItem("provider_model");
|
||||
const storedToken = localStorage.getItem(storedProviderModel);
|
||||
|
||||
if (storedProviderModel) {
|
||||
document.getElementById("provider-model-select").value = storedProviderModel;
|
||||
}
|
||||
|
||||
if (storedToken) {
|
||||
document.getElementById("token-input").value = storedToken;
|
||||
}
|
||||
|
||||
// Handle provider model dropdown change
|
||||
document.getElementById("provider-model-select").addEventListener("change", () => {
|
||||
const selectedProviderModel = document.getElementById("provider-model-select").value;
|
||||
const storedToken = localStorage.getItem(selectedProviderModel);
|
||||
|
||||
if (storedToken) {
|
||||
document.getElementById("token-input").value = storedToken;
|
||||
} else {
|
||||
document.getElementById("token-input").value = "";
|
||||
}
|
||||
});
|
||||
|
||||
// Fetch total count from the database
|
||||
axios
|
||||
.get("/total-count")
|
||||
.then((response) => {
|
||||
document.getElementById("total-count").textContent = response.data.count;
|
||||
})
|
||||
.catch((error) => console.error(error));
|
||||
|
||||
// Handle crawl button click
|
||||
document.getElementById("crawl-btn").addEventListener("click", () => {
|
||||
// validate input to have both URL and API token
|
||||
if (!document.getElementById("url-input").value || !document.getElementById("token-input").value) {
|
||||
alert("Please enter both URL(s) and API token.");
|
||||
return;
|
||||
}
|
||||
|
||||
const selectedProviderModel = document.getElementById("provider-model-select").value;
|
||||
const apiToken = document.getElementById("token-input").value;
|
||||
const extractBlocks = document.getElementById("extract-blocks-checkbox").checked;
|
||||
const bypassCache = document.getElementById("bypass-cache-checkbox").checked;
|
||||
|
||||
// Save the selected provider model and token to local storage
|
||||
localStorage.setItem("provider_model", selectedProviderModel);
|
||||
localStorage.setItem(selectedProviderModel, apiToken);
|
||||
|
||||
const urlsInput = document.getElementById("url-input").value;
|
||||
const urls = urlsInput.split(",").map((url) => url.trim());
|
||||
const data = {
|
||||
urls: urls,
|
||||
provider_model: selectedProviderModel,
|
||||
api_token: apiToken,
|
||||
include_raw_html: true,
|
||||
bypass_cache: bypassCache,
|
||||
extract_blocks: extractBlocks,
|
||||
word_count_threshold: parseInt(document.getElementById("threshold").value),
|
||||
extraction_strategy: document.getElementById("extraction-strategy-select").value,
|
||||
chunking_strategy: document.getElementById("chunking-strategy-select").value,
|
||||
css_selector: document.getElementById("css-selector").value,
|
||||
verbose: true,
|
||||
};
|
||||
|
||||
// save api token to local storage
|
||||
localStorage.setItem("api_token", document.getElementById("token-input").value);
|
||||
|
||||
document.getElementById("loading").classList.remove("hidden");
|
||||
//document.getElementById("result").classList.add("hidden");
|
||||
//document.getElementById("code_help").classList.add("hidden");
|
||||
|
||||
axios
|
||||
.post("/crawl", data)
|
||||
.then((response) => {
|
||||
const result = response.data.results[0];
|
||||
const parsedJson = JSON.parse(result.extracted_content);
|
||||
document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
|
||||
document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
|
||||
document.getElementById("markdown-result").textContent = result.markdown;
|
||||
|
||||
// Update code examples dynamically
|
||||
const extractionStrategy = data.extraction_strategy;
|
||||
const isLLMExtraction = extractionStrategy === "LLMExtractionStrategy";
|
||||
|
||||
document.getElementById(
|
||||
"curl-code"
|
||||
).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({
|
||||
...data,
|
||||
api_token: isLLMExtraction ? "your_api_token" : undefined,
|
||||
})}' http://crawl4ai.uccode.io/crawl`;
|
||||
|
||||
document.getElementById(
|
||||
"python-code"
|
||||
).textContent = `import requests\n\ndata = ${JSON.stringify(
|
||||
{ ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
|
||||
null,
|
||||
2
|
||||
)}\n\nresponse = requests.post("http://crawl4ai.uccode.io/crawl", json=data) # OR local host if your run locally \nprint(response.json())`;
|
||||
|
||||
document.getElementById(
|
||||
"nodejs-code"
|
||||
).textContent = `const axios = require('axios');\n\nconst data = ${JSON.stringify(
|
||||
{ ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
|
||||
null,
|
||||
2
|
||||
)};\n\naxios.post("http://crawl4ai.uccode.io/crawl", data) // OR local host if your run locally \n .then(response => console.log(response.data))\n .catch(error => console.error(error));`;
|
||||
|
||||
document.getElementById(
|
||||
"library-code"
|
||||
).textContent = `from crawl4ai.web_crawler import WebCrawler\nfrom crawl4ai.extraction_strategy import *\nfrom crawl4ai.chunking_strategy import *\n\ncrawler = WebCrawler()\ncrawler.warmup()\n\nresult = crawler.run(\n url='${
|
||||
urls[0]
|
||||
}',\n word_count_threshold=${data.word_count_threshold},\n extraction_strategy=${
|
||||
isLLMExtraction
|
||||
? `${extractionStrategy}(provider="${data.provider_model}", api_token="${data.api_token}")`
|
||||
: extractionStrategy + "()"
|
||||
},\n chunking_strategy=${data.chunking_strategy}(),\n bypass_cache=${
|
||||
data.bypass_cache
|
||||
},\n css_selector="${data.css_selector}"\n)\nprint(result)`;
|
||||
|
||||
// Highlight code syntax
|
||||
hljs.highlightAll();
|
||||
|
||||
// Select JSON tab by default
|
||||
document.querySelector('.tab-btn[data-tab="json"]').click();
|
||||
|
||||
document.getElementById("loading").classList.add("hidden");
|
||||
document.getElementById("result").classList.remove("hidden");
|
||||
document.getElementById("code_help").classList.remove("hidden");
|
||||
|
||||
// increment the total count
|
||||
document.getElementById("total-count").textContent =
|
||||
parseInt(document.getElementById("total-count").textContent) + 1;
|
||||
})
|
||||
.catch((error) => {
|
||||
console.error(error);
|
||||
document.getElementById("loading").classList.add("hidden");
|
||||
});
|
||||
});
|
||||
|
||||
// Handle tab clicks
|
||||
document.querySelectorAll(".tab-btn").forEach((btn) => {
|
||||
btn.addEventListener("click", () => {
|
||||
const tab = btn.dataset.tab;
|
||||
document
|
||||
.querySelectorAll(".tab-btn")
|
||||
.forEach((b) => b.classList.remove("bg-lime-700", "text-white"));
|
||||
btn.classList.add("bg-lime-700", "text-white");
|
||||
document.querySelectorAll(".tab-content.code pre").forEach((el) => el.classList.add("hidden"));
|
||||
document.getElementById(`${tab}-result`).parentElement.classList.remove("hidden");
|
||||
});
|
||||
});
|
||||
|
||||
// Handle code tab clicks
|
||||
document.querySelectorAll(".code-tab-btn").forEach((btn) => {
|
||||
btn.addEventListener("click", () => {
|
||||
const tab = btn.dataset.tab;
|
||||
document
|
||||
.querySelectorAll(".code-tab-btn")
|
||||
.forEach((b) => b.classList.remove("bg-lime-700", "text-white"));
|
||||
btn.classList.add("bg-lime-700", "text-white");
|
||||
document.querySelectorAll(".tab-content.result pre").forEach((el) => el.classList.add("hidden"));
|
||||
document.getElementById(`${tab}-code`).parentElement.classList.remove("hidden");
|
||||
});
|
||||
});
|
||||
|
||||
// Handle copy to clipboard button clicks
|
||||
|
||||
async function copyToClipboard(text) {
|
||||
if (navigator.clipboard && navigator.clipboard.writeText) {
|
||||
return navigator.clipboard.writeText(text);
|
||||
} else {
|
||||
return fallbackCopyTextToClipboard(text);
|
||||
}
|
||||
}
|
||||
|
||||
function fallbackCopyTextToClipboard(text) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const textArea = document.createElement("textarea");
|
||||
textArea.value = text;
|
||||
|
||||
// Avoid scrolling to bottom
|
||||
textArea.style.top = "0";
|
||||
textArea.style.left = "0";
|
||||
textArea.style.position = "fixed";
|
||||
|
||||
document.body.appendChild(textArea);
|
||||
textArea.focus();
|
||||
textArea.select();
|
||||
|
||||
try {
|
||||
const successful = document.execCommand("copy");
|
||||
if (successful) {
|
||||
resolve();
|
||||
} else {
|
||||
reject();
|
||||
}
|
||||
} catch (err) {
|
||||
reject(err);
|
||||
}
|
||||
|
||||
document.body.removeChild(textArea);
|
||||
});
|
||||
}
|
||||
|
||||
document.querySelectorAll(".copy-btn").forEach((btn) => {
|
||||
btn.addEventListener("click", () => {
|
||||
const target = btn.dataset.target;
|
||||
const code = document.getElementById(target).textContent;
|
||||
//navigator.clipboard.writeText(code).then(() => {
|
||||
copyToClipboard(code).then(() => {
|
||||
btn.textContent = "Copied!";
|
||||
setTimeout(() => {
|
||||
btn.textContent = "Copy";
|
||||
}, 2000);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
document.addEventListener("DOMContentLoaded", async () => {
|
||||
try {
|
||||
const extractionResponse = await fetch("/strategies/extraction");
|
||||
const extractionStrategies = await extractionResponse.json();
|
||||
|
||||
const chunkingResponse = await fetch("/strategies/chunking");
|
||||
const chunkingStrategies = await chunkingResponse.json();
|
||||
|
||||
renderStrategies("extraction-strategies", extractionStrategies);
|
||||
renderStrategies("chunking-strategies", chunkingStrategies);
|
||||
} catch (error) {
|
||||
console.error("Error fetching strategies:", error);
|
||||
}
|
||||
});
|
||||
|
||||
function renderStrategies(containerId, strategies) {
|
||||
const container = document.getElementById(containerId);
|
||||
container.innerHTML = ""; // Clear any existing content
|
||||
strategies = JSON.parse(strategies);
|
||||
Object.entries(strategies).forEach(([strategy, description]) => {
|
||||
const strategyElement = document.createElement("div");
|
||||
strategyElement.classList.add("bg-zinc-800", "p-4", "rounded", "shadow-md", "docs-item");
|
||||
|
||||
const strategyDescription = document.createElement("div");
|
||||
strategyDescription.classList.add("text-gray-300", "prose", "prose-sm");
|
||||
strategyDescription.innerHTML = marked.parse(description);
|
||||
|
||||
strategyElement.appendChild(strategyDescription);
|
||||
|
||||
container.appendChild(strategyElement);
|
||||
});
|
||||
}
|
||||
|
||||
// Highlight code syntax
|
||||
hljs.highlightAll();
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
@@ -1,73 +0,0 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<title>Crawl4AI</title>
|
||||
|
||||
<link rel="preconnect" href="https://fonts.googleapis.com" />
|
||||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
|
||||
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@100..900&display=swap" rel="stylesheet" />
|
||||
|
||||
<!-- <link href="https://cdn.jsdelivr.net/npm/tailwindcss@3.4.3/dist/tailwind.min.css" rel="stylesheet" /> -->
|
||||
<script src="https://cdn.tailwindcss.com"></script>
|
||||
<script src="https://cdn.jsdelivr.net/npm/axios/dist/axios.min.js"></script>
|
||||
<link rel="stylesheet" href="/pages/app.css" />
|
||||
<link
|
||||
rel="stylesheet"
|
||||
href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/styles/monokai.min.css"
|
||||
/>
|
||||
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
|
||||
|
||||
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/highlight.min.js"></script>
|
||||
</head>
|
||||
<body class="bg-black text-gray-200">
|
||||
<header class="bg-zinc-950 text-lime-500 py-4 flex">
|
||||
|
||||
<div class="mx-auto px-4">
|
||||
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts</h1>
|
||||
</div>
|
||||
<div class="mx-auto px-4 flex font-bold text-xl gap-2">
|
||||
<span>📊 Total Website Processed</span>
|
||||
<span id="total-count" class="text-lime-400">2</span>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
{{ try_it | safe }}
|
||||
|
||||
<div class="mx-auto p-4 bg-zinc-950 text-lime-500 min-h-screen">
|
||||
<div class="container mx-auto">
|
||||
<div class="flex h-full px-20">
|
||||
<div class="sidebar w-1/4 p-4">
|
||||
<h2 class="text-lg font-bold mb-4">Outline</h2>
|
||||
<ul>
|
||||
<li class="mb-2"><a href="#" data-target="installation">Installation</a></li>
|
||||
<li class="mb-2"><a href="#" data-target="how-to-guide">How to Guide</a></li>
|
||||
<li class="mb-2"><a href="#" data-target="chunking-strategies">Chunking Strategies</a></li>
|
||||
<li class="mb-2">
|
||||
<a href="#" data-target="extraction-strategies">Extraction Strategies</a>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<!-- Main Content -->
|
||||
<div class="w-3/4 p-4">
|
||||
{{installation | safe}} {{how_to_guide | safe}}
|
||||
|
||||
<section id="chunking-strategies" class="content-section">
|
||||
<h1 class="text-2xl font-bold">Chunking Strategies</h1>
|
||||
<p>Content for chunking strategies...</p>
|
||||
</section>
|
||||
<section id="extraction-strategies" class="content-section">
|
||||
<h1 class="text-2xl font-bold">Extraction Strategies</h1>
|
||||
<p>Content for extraction strategies...</p>
|
||||
</section>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{{ footer | safe }}
|
||||
<script script src="/pages/app.js"></script>
|
||||
</body>
|
||||
</html>
|
||||
@@ -1,425 +0,0 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<title>Crawl4AI</title>
|
||||
|
||||
<link rel="preconnect" href="https://fonts.googleapis.com" />
|
||||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
|
||||
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@100..900&display=swap" rel="stylesheet" />
|
||||
|
||||
<link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet" />
|
||||
<script src="https://cdn.jsdelivr.net/npm/axios/dist/axios.min.js"></script>
|
||||
<link
|
||||
rel="stylesheet"
|
||||
href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/styles/vs2015.min.css"
|
||||
/>
|
||||
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/highlight.min.js"></script>
|
||||
<style>
|
||||
:root {
|
||||
--ifm-font-size-base: 100%;
|
||||
--ifm-line-height-base: 1.65;
|
||||
--ifm-font-family-base: system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans,
|
||||
sans-serif, BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif, "Apple Color Emoji",
|
||||
"Segoe UI Emoji", "Segoe UI Symbol";
|
||||
}
|
||||
html {
|
||||
-webkit-font-smoothing: antialiased;
|
||||
-webkit-text-size-adjust: 100%;
|
||||
text-size-adjust: 100%;
|
||||
font: var(--ifm-font-size-base) / var(--ifm-line-height-base) var(--ifm-font-family-base);
|
||||
}
|
||||
body {
|
||||
background-color: #1a202c;
|
||||
color: #fff;
|
||||
}
|
||||
.tab-content {
|
||||
max-height: 400px;
|
||||
overflow: auto;
|
||||
}
|
||||
pre {
|
||||
white-space: pre-wrap;
|
||||
font-size: 14px;
|
||||
}
|
||||
pre code {
|
||||
width: 100%;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<header class="bg-gray-900 text-white py-4">
|
||||
<div class="container mx-auto px-4">
|
||||
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Open-source LLM Friendly Web scraper</h1>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<section class="try-it py-8 pb-20">
|
||||
<div class="container mx-auto px-4">
|
||||
<h2 class="text-2xl font-bold mb-4">Try It Now</h2>
|
||||
<div class="mb-4 flex w-full gap-2">
|
||||
<input
|
||||
type="text"
|
||||
id="url-input"
|
||||
value="https://kidocode.com"
|
||||
class="border border-gray-600 rounded px-4 py-2 flex-grow bg-gray-800 text-white"
|
||||
placeholder="Enter URL(s) separated by commas"
|
||||
/>
|
||||
<select
|
||||
id="provider-model-select"
|
||||
class="border border-gray-600 rounded px-4 py-2 bg-gray-800 text-white"
|
||||
>
|
||||
<!-- Add your option values here -->
|
||||
<option value="groq/llama3-70b-8192">groq/llama3-70b-8192</option>
|
||||
<option value="groq/llama3-8b-8192">groq/llama3-8b-8192</option>
|
||||
<option value="openai/gpt-4-turbo">gpt-4-turbo</option>
|
||||
<option value="openai/gpt-3.5-turbo">gpt-3.5-turbo</option>
|
||||
<option value="anthropic/claude-3-haiku-20240307">claude-3-haiku</option>
|
||||
<option value="anthropic/claude-3-opus-20240229">claude-3-opus</option>
|
||||
<option value="anthropic/claude-3-sonnet-20240229">claude-3-sonnet</option>
|
||||
</select>
|
||||
<input
|
||||
type="password"
|
||||
id="token-input"
|
||||
class="border border-gray-600 rounded px-4 py-2 flex-grow bg-gray-800 text-white"
|
||||
placeholder="Enter Groq API token"
|
||||
/>
|
||||
<div class="flex items-center justify-center">
|
||||
<input type="checkbox" id="extract-blocks-checkbox" class="mr-2" checked>
|
||||
<label for="extract-blocks-checkbox" class="text-white">Extract Blocks</label>
|
||||
</div>
|
||||
<button id="crawl-btn" class="bg-blue-600 text-white px-4 py-2 rounded">Crawl</button>
|
||||
</div>
|
||||
<div class="grid grid-cols-1 md:grid-cols-2 gap-8">
|
||||
<div id="loading" class="hidden mt-4">
|
||||
<p>Loading...</p>
|
||||
</div>
|
||||
<div id="result" class="tab-container flex-1 h-full flex-col">
|
||||
<div class="tab-buttons flex gap-2">
|
||||
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="json">JSON</button>
|
||||
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="cleaned-html">
|
||||
Cleaned HTML
|
||||
</button>
|
||||
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="markdown">
|
||||
Markdown
|
||||
</button>
|
||||
</div>
|
||||
<div class="tab-content code bg-gray-800 p-2 rounded h-full flex-1 border border-gray-600">
|
||||
<pre class="h-full flex"><code id="json-result" class="language-json "></code></pre>
|
||||
<pre
|
||||
class="hidden h-full flex"
|
||||
><code id="cleaned-html-result" class="language-html "></code></pre>
|
||||
<pre
|
||||
class="hidden h-full flex"
|
||||
><code id="markdown-result" class="language-markdown "></code></pre>
|
||||
</div>
|
||||
</div>
|
||||
<div id="code_help" class="tab-container flex-1 h-full">
|
||||
<div class="tab-buttons flex gap-2">
|
||||
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="curl">cURL</button>
|
||||
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="python">
|
||||
Python
|
||||
</button>
|
||||
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="nodejs">
|
||||
Node.js
|
||||
</button>
|
||||
</div>
|
||||
<div class="tab-content result bg-gray-800 p-2 rounded h-full flex-1 border border-gray-600">
|
||||
<pre class="h-full flex relative">
|
||||
<code id="curl-code" class="language-bash"></code>
|
||||
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="curl-code">Copy</button>
|
||||
</pre>
|
||||
<pre class="hidden h-full flex relative">
|
||||
<code id="python-code" class="language-python"></code>
|
||||
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="python-code">Copy</button>
|
||||
</pre>
|
||||
<pre class="hidden h-full flex relative">
|
||||
<code id="nodejs-code" class="language-javascript"></code>
|
||||
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="nodejs-code">Copy</button>
|
||||
</pre>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section class="hero bg-gray-900 py-8">
|
||||
<div class="container mx-auto px-4">
|
||||
<h2 class="text-3xl font-bold mb-4">🤔 Why building this?</h2>
|
||||
<p class="text-lg mb-4">
|
||||
In recent times, we've seen numerous startups emerging, riding the AI hype wave and charging for
|
||||
services that should rightfully be accessible to everyone. 🌍💸 One for example is to scrap and crawl
|
||||
a web page, and transform it o a form suitable for LLM. We don't think one should build a business
|
||||
out of this, but definilty should be opened source. So if you possess the skills to build such things
|
||||
and you have such philosphy you should join our "Robinhood" band and help set
|
||||
these products free. 🆓🤝
|
||||
</p>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section class="installation py-8">
|
||||
<div class="container mx-auto px-4">
|
||||
<h2 class="text-2xl font-bold mb-4">⚙️ Installation</h2>
|
||||
<p class="mb-4">
|
||||
To install and run Crawl4AI locally or on your own service, the best way is to use Docker. 🐳 Follow
|
||||
these steps:
|
||||
</p>
|
||||
<ol class="list-decimal list-inside mb-4">
|
||||
<li>
|
||||
Clone the GitHub repository: 📥
|
||||
<code>git clone https://github.com/unclecode/crawl4ai.git</code>
|
||||
</li>
|
||||
<li>Navigate to the project directory: 📂 <code>cd crawl4ai</code></li>
|
||||
<li>
|
||||
Build the Docker image: 🛠️ <code>docker build -t crawl4ai .</code> On Mac, follow: 🍎
|
||||
<code>docker build --platform linux/amd64 -t crawl4ai .</code>
|
||||
</li>
|
||||
<li>Run the Docker container: ▶️ <code>docker run -p 8000:80 crawl4ai</code></li>
|
||||
</ol>
|
||||
<p>
|
||||
For more detailed instructions and advanced configuration options, please refer to the 📚
|
||||
<a href="https://github.com/unclecode/crawl4ai" class="text-blue-400">GitHub repository</a>.
|
||||
</p>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<footer class="bg-gray-900 text-white py-4">
|
||||
<div class="container mx-auto px-4">
|
||||
<div class="flex justify-between items-center">
|
||||
<p>© 2024 Crawl4AI. All rights reserved.</p>
|
||||
<div class="social-links">
|
||||
<a
|
||||
href="https://github.com/unclecode/crawl4ai"
|
||||
class="text-white hover:text-gray-300 mx-2"
|
||||
target="_blank"
|
||||
>😺 GitHub</a
|
||||
>
|
||||
<a
|
||||
href="https://twitter.com/unclecode"
|
||||
class="text-white hover:text-gray-300 mx-2"
|
||||
target="_blank"
|
||||
>🐦 Twitter</a
|
||||
>
|
||||
<a
|
||||
href="https://discord.gg/your-invite-link"
|
||||
class="text-white hover:text-gray-300 mx-2"
|
||||
target="_blank"
|
||||
>💬 Discord</a
|
||||
>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</footer>
|
||||
|
||||
<script>
|
||||
// Get the selected provider model and token from local storage
|
||||
const storedProviderModel = localStorage.getItem("provider_model");
|
||||
const storedToken = localStorage.getItem(storedProviderModel);
|
||||
|
||||
if (storedProviderModel) {
|
||||
document.getElementById("provider-model-select").value = storedProviderModel;
|
||||
}
|
||||
|
||||
if (storedToken) {
|
||||
document.getElementById("token-input").value = storedToken;
|
||||
}
|
||||
|
||||
// Handle provider model dropdown change
|
||||
document.getElementById("provider-model-select").addEventListener("change", () => {
|
||||
const selectedProviderModel = document.getElementById("provider-model-select").value;
|
||||
const storedToken = localStorage.getItem(selectedProviderModel);
|
||||
|
||||
if (storedToken) {
|
||||
document.getElementById("token-input").value = storedToken;
|
||||
} else {
|
||||
document.getElementById("token-input").value = "";
|
||||
}
|
||||
});
|
||||
|
||||
// Fetch total count from the database
|
||||
axios
|
||||
.get("/total-count")
|
||||
.then((response) => {
|
||||
document.getElementById("total-count").textContent = response.data.count;
|
||||
})
|
||||
.catch((error) => console.error(error));
|
||||
|
||||
// Handle crawl button click
|
||||
document.getElementById("crawl-btn").addEventListener("click", () => {
|
||||
// validate input to have both URL and API token
|
||||
if (!document.getElementById("url-input").value || !document.getElementById("token-input").value) {
|
||||
alert("Please enter both URL(s) and API token.");
|
||||
return;
|
||||
}
|
||||
|
||||
const selectedProviderModel = document.getElementById("provider-model-select").value;
|
||||
const apiToken = document.getElementById("token-input").value;
|
||||
const extractBlocks = document.getElementById("extract-blocks-checkbox").checked;
|
||||
|
||||
|
||||
// Save the selected provider model and token to local storage
|
||||
localStorage.setItem("provider_model", selectedProviderModel);
|
||||
localStorage.setItem(selectedProviderModel, apiToken);
|
||||
|
||||
const urlsInput = document.getElementById("url-input").value;
|
||||
const urls = urlsInput.split(",").map((url) => url.trim());
|
||||
const data = {
|
||||
urls: urls,
|
||||
provider_model: selectedProviderModel,
|
||||
api_token: apiToken,
|
||||
include_raw_html: true,
|
||||
forced: false,
|
||||
extract_blocks: extractBlocks,
|
||||
};
|
||||
|
||||
// save api token to local storage
|
||||
localStorage.setItem("api_token", document.getElementById("token-input").value);
|
||||
|
||||
document.getElementById("loading").classList.remove("hidden");
|
||||
document.getElementById("result").classList.add("hidden");
|
||||
document.getElementById("code_help").classList.add("hidden");
|
||||
|
||||
axios
|
||||
.post("/crawl", data)
|
||||
.then((response) => {
|
||||
const result = response.data.results[0];
|
||||
const parsedJson = JSON.parse(result.extracted_content);
|
||||
document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
|
||||
document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
|
||||
document.getElementById("markdown-result").textContent = result.markdown;
|
||||
|
||||
// Update code examples dynamically
|
||||
// Update code examples dynamically
|
||||
document.getElementById(
|
||||
"curl-code"
|
||||
).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({
|
||||
...data,
|
||||
api_token: "your_api_token",
|
||||
})}' http://localhost:8000/crawl`;
|
||||
|
||||
document.getElementById(
|
||||
"python-code"
|
||||
).textContent = `import requests\n\ndata = ${JSON.stringify(
|
||||
{ ...data, api_token: "your_api_token" },
|
||||
null,
|
||||
2
|
||||
)}\n\nresponse = requests.post("http://localhost:8000/crawl", json=data)\nprint(response.json())`;
|
||||
|
||||
document.getElementById(
|
||||
"nodejs-code"
|
||||
).textContent = `const axios = require('axios');\n\nconst data = ${JSON.stringify(
|
||||
{ ...data, api_token: "your_api_token" },
|
||||
null,
|
||||
2
|
||||
)};\n\naxios.post("http://localhost:8000/crawl", data)\n .then(response => console.log(response.data))\n .catch(error => console.error(error));`;
|
||||
// Highlight code syntax
|
||||
hljs.highlightAll();
|
||||
|
||||
// Select JSON tab by default
|
||||
document.querySelector('.tab-btn[data-tab="json"]').click();
|
||||
|
||||
document.getElementById("loading").classList.add("hidden");
|
||||
document.getElementById("result").classList.remove("hidden");
|
||||
document.getElementById("code_help").classList.remove("hidden");
|
||||
})
|
||||
.catch((error) => {
|
||||
console.error(error);
|
||||
document.getElementById("loading").classList.add("hidden");
|
||||
});
|
||||
});
|
||||
|
||||
// Handle tab clicks
|
||||
document.querySelectorAll(".tab-btn").forEach((btn) => {
|
||||
btn.addEventListener("click", () => {
|
||||
const tab = btn.dataset.tab;
|
||||
document
|
||||
.querySelectorAll(".tab-btn")
|
||||
.forEach((b) => b.classList.remove("bg-blue-600", "text-white"));
|
||||
btn.classList.add("bg-blue-600", "text-white");
|
||||
document.querySelectorAll(".tab-content.code pre").forEach((el) => el.classList.add("hidden"));
|
||||
document.getElementById(`${tab}-result`).parentElement.classList.remove("hidden");
|
||||
});
|
||||
});
|
||||
|
||||
// Handle code tab clicks
|
||||
document.querySelectorAll(".code-tab-btn").forEach((btn) => {
|
||||
btn.addEventListener("click", () => {
|
||||
const tab = btn.dataset.tab;
|
||||
document
|
||||
.querySelectorAll(".code-tab-btn")
|
||||
.forEach((b) => b.classList.remove("bg-blue-600", "text-white"));
|
||||
btn.classList.add("bg-blue-600", "text-white");
|
||||
document.querySelectorAll(".tab-content.result pre").forEach((el) => el.classList.add("hidden"));
|
||||
document.getElementById(`${tab}-code`).parentElement.classList.remove("hidden");
|
||||
});
|
||||
});
|
||||
|
||||
// Handle copy to clipboard button clicks
|
||||
document.querySelectorAll(".copy-btn").forEach((btn) => {
|
||||
btn.addEventListener("click", () => {
|
||||
const target = btn.dataset.target;
|
||||
const code = document.getElementById(target).textContent;
|
||||
navigator.clipboard.writeText(code).then(() => {
|
||||
btn.textContent = "Copied!";
|
||||
setTimeout(() => {
|
||||
btn.textContent = "Copy";
|
||||
}, 2000);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
document.getElementById("crawl-btn").addEventListener("click", () => {
|
||||
const urlsInput = document.getElementById("url-input").value;
|
||||
const urls = urlsInput.split(",").map(url => url.trim());
|
||||
const apiToken = document.getElementById("token-input").value;
|
||||
const selectedProviderModel = document.getElementById("provider-model-select").value;
|
||||
const extractBlocks = document.getElementById("extract-blocks-checkbox").checked;
|
||||
|
||||
const data = {
|
||||
urls: urls,
|
||||
provider_model: selectedProviderModel,
|
||||
api_token: apiToken,
|
||||
include_raw_html: true,
|
||||
forced: false,
|
||||
extract_blocks: extractBlocks
|
||||
};
|
||||
|
||||
localStorage.setItem("api_token", apiToken);
|
||||
|
||||
document.getElementById("loading").classList.remove("hidden");
|
||||
document.getElementById("result").classList.add("hidden");
|
||||
document.getElementById("code_help").classList.add("hidden");
|
||||
|
||||
axios.post("/crawl", data)
|
||||
.then(response => {
|
||||
const taskId = response.data.task_id;
|
||||
pollTaskStatus(taskId);
|
||||
})
|
||||
.catch(error => {
|
||||
console.error('Error during fetch:', error);
|
||||
document.getElementById("loading").classList.add("hidden");
|
||||
});
|
||||
});
|
||||
|
||||
function pollTaskStatus(taskId) {
|
||||
axios.get(`/task/${taskId}`)
|
||||
.then(response => {
|
||||
const task = response.data;
|
||||
if (task.status === 'done') {
|
||||
displayResults(task.results[0]);
|
||||
} else if (task.status === 'pending') {
|
||||
setTimeout(() => pollTaskStatus(taskId), 2000); // Poll every 2 seconds
|
||||
} else {
|
||||
console.error('Task failed:', task.error);
|
||||
document.getElementById("loading").classList.add("hidden");
|
||||
}
|
||||
})
|
||||
.catch(error => {
|
||||
console.error('Error polling task status:', error);
|
||||
document.getElementById("loading").classList.add("hidden");
|
||||
});
|
||||
}
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
@@ -1,36 +0,0 @@
|
||||
<section class="hero bg-zinc-900 py-8 px-20 text-zinc-400">
|
||||
<div class="container mx-auto px-4">
|
||||
<h2 class="text-3xl font-bold mb-4">🤔 Why building this?</h2>
|
||||
<p class="text-lg mb-4">
|
||||
In recent times, we've witnessed a surge of startups emerging, riding the AI hype wave and charging
|
||||
for services that should rightfully be accessible to everyone. 🌍💸 One such example is scraping and
|
||||
crawling web pages and transforming them into a format suitable for Large Language Models (LLMs).
|
||||
🕸️🤖 We believe that building a business around this is not the right approach; instead, it should
|
||||
definitely be open-source. 🆓🌟 So, if you possess the skills to build such tools and share our
|
||||
philosophy, we invite you to join our "Robinhood" band and help set these products free for the
|
||||
benefit of all. 🤝💪
|
||||
</p>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<footer class="bg-zinc-900 text-zinc-400 py-4">
|
||||
<div class="container mx-auto px-4">
|
||||
<div class="flex justify-between items-center">
|
||||
<p>© 2024 Crawl4AI. All rights reserved.</p>
|
||||
<div class="social-links">
|
||||
<a
|
||||
href="https://github.com/unclecode/crawl4ai"
|
||||
class="text-zinc-400 hover:text-gray-300 mx-2"
|
||||
target="_blank"
|
||||
>😺 GitHub</a
|
||||
>
|
||||
<a
|
||||
href="https://twitter.com/unclecode"
|
||||
class="text-zinc-400 hover:text-gray-300 mx-2"
|
||||
target="_blank"
|
||||
>🐦 Twitter</a
|
||||
>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</footer>
|
||||
@@ -1,174 +0,0 @@
|
||||
<section id="how-to-guide" class="content-section">
|
||||
<h1 class="text-2xl font-bold">How to Guide</h1>
|
||||
<div class="flex flex-col gap-4 p-4 bg-zinc-900 text-lime-500">
|
||||
<!-- Step 1 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🌟
|
||||
<strong
|
||||
>Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling
|
||||
fun!</strong
|
||||
>
|
||||
</div>
|
||||
<div class="">
|
||||
First Step: Create an instance of WebCrawler and call the
|
||||
<code>warmup()</code> function.
|
||||
</div>
|
||||
<div>
|
||||
<pre><code class="language-python">crawler = WebCrawler()
|
||||
crawler.warmup()</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 2 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🧠 <strong>Understanding 'bypass_cache' and 'include_raw_html' parameters:</strong>
|
||||
</div>
|
||||
<div class="">First crawl (caches the result):</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(url="https://www.nbcnews.com/business")</code></pre>
|
||||
</div>
|
||||
<div class="">Second crawl (Force to crawl again):</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)</code></pre>
|
||||
<div class="bg-red-900 p-2 text-zinc-50">
|
||||
⚠️ Don't forget to set <code>`bypass_cache`</code> to True if you want to try different strategies for the same URL. Otherwise, the cached result will be returned. You can also set <code>`always_by_pass_cache`</code> in constructor to True to always bypass the cache.
|
||||
</div>
|
||||
</div>
|
||||
<div class="">Crawl result without raw HTML content:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 3 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
📄
|
||||
<strong
|
||||
>The 'include_raw_html' parameter, when set to True, includes the raw HTML content
|
||||
in the response. By default, it is set to True.</strong
|
||||
>
|
||||
</div>
|
||||
<div class="">Set <code>always_by_pass_cache</code> to True:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">crawler.always_by_pass_cache = True</code></pre>
|
||||
</div>
|
||||
<!-- Step 3.5 Screenshot -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
📸
|
||||
<strong>Let's take a screenshot of the page!</strong>
|
||||
</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
screenshot=True
|
||||
)
|
||||
with open("screenshot.png", "wb") as f:
|
||||
f.write(base64.b64decode(result.screenshot))</code></pre>
|
||||
</div>
|
||||
|
||||
|
||||
<!-- Step 4 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🧩 <strong>Let's add a chunking strategy: RegexChunking!</strong>
|
||||
</div>
|
||||
<div class="">Using RegexChunking:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
chunking_strategy=RegexChunking(patterns=["\n\n"])
|
||||
)</code></pre>
|
||||
</div>
|
||||
<div class="">Using NlpSentenceChunking:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
chunking_strategy=NlpSentenceChunking()
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 5 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🧠 <strong>Let's get smarter with an extraction strategy: CosineStrategy!</strong>
|
||||
</div>
|
||||
<div class="">Using CosineStrategy:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3)
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 6 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🤖
|
||||
<strong
|
||||
>Time to bring in the big guns: LLMExtractionStrategy without instructions!</strong
|
||||
>
|
||||
</div>
|
||||
<div class="">Using LLMExtractionStrategy without instructions:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 7 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
📜
|
||||
<strong
|
||||
>Let's make it even more interesting: LLMExtractionStrategy with
|
||||
instructions!</strong
|
||||
>
|
||||
</div>
|
||||
<div class="">Using LLMExtractionStrategy with instructions:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
instruction="I am interested in only financial news"
|
||||
)
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 8 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🎯
|
||||
<strong>Targeted extraction: Let's use a CSS selector to extract only H2 tags!</strong>
|
||||
</div>
|
||||
<div class="">Using CSS selector to extract H2 tags:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
css_selector="h2"
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 9 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🖱️
|
||||
<strong
|
||||
>Let's get interactive: Passing JavaScript code to click 'Load More' button!</strong
|
||||
>
|
||||
</div>
|
||||
<div class="">Using JavaScript to click 'Load More' button:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">js_code = ["""
|
||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||
loadMoreButton && loadMoreButton.click();
|
||||
"""]
|
||||
crawler = WebCrawler(verbos=crawler_strategy, always_by_pass_cache=True)
|
||||
result = crawler.run(url="https://www.nbcnews.com/business", js = js_code)</code></pre>
|
||||
<div class="">Remember that you can pass multiple JavaScript code snippets in the list. They all will be executed in the order they are passed.</div>
|
||||
</div>
|
||||
|
||||
<!-- Conclusion -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🎉
|
||||
<strong
|
||||
>Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth
|
||||
and crawl the web like a pro! 🕸️</strong
|
||||
>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
@@ -1,65 +0,0 @@
|
||||
<section id="installation" class="content-section active">
|
||||
<h1 class="text-2xl font-bold">Installation 💻</h1>
|
||||
<p class="mb-4">
|
||||
There are three ways to use Crawl4AI:
|
||||
<ol class="list-decimal list-inside mb-4">
|
||||
<li class="">
|
||||
As a library
|
||||
</li>
|
||||
<li class="">
|
||||
As a local server (Docker)
|
||||
</li>
|
||||
<li class="">
|
||||
As a Google Colab notebook. <a href="https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk"
|
||||
><img
|
||||
src="https://colab.research.google.com/assets/colab-badge.svg"
|
||||
alt="Open In Colab"
|
||||
style="display: inline-block; width: 100px; height: 20px"
|
||||
/></a>
|
||||
</li>
|
||||
</p>
|
||||
|
||||
|
||||
<p class="my-4">To install Crawl4AI as a library, follow these steps:</p>
|
||||
|
||||
<ol class="list-decimal list-inside mb-4">
|
||||
<li class="mb-4">
|
||||
Install the package from GitHub:
|
||||
<pre
|
||||
class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
|
||||
><code>virtualenv venv
|
||||
source venv/bin/activate
|
||||
pip install "crawl4ai[all] @ git+https://github.com/unclecode/crawl4ai.git"
|
||||
</code></pre>
|
||||
</li>
|
||||
<li class="mb-4">
|
||||
Run the following command to load the required models. This is optional, but it will boost the performance and speed of the crawler. You need to do this only once.
|
||||
<pre
|
||||
class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
|
||||
><code>crawl4ai-download-models</code></pre>
|
||||
</li>
|
||||
<li class="mb-4">
|
||||
Alternatively, you can clone the repository and install the package locally:
|
||||
<pre
|
||||
class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
|
||||
><code class = "language-python bash">virtualenv venv
|
||||
source venv/bin/activate
|
||||
git clone https://github.com/unclecode/crawl4ai.git
|
||||
cd crawl4ai
|
||||
pip install -e .[all]
|
||||
</code></pre>
|
||||
</li>
|
||||
<li class="">
|
||||
Use docker to run the local server:
|
||||
<pre
|
||||
class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
|
||||
><code class = "language-python bash">docker build -t crawl4ai .
|
||||
# docker build --platform linux/amd64 -t crawl4ai . For Mac users
|
||||
docker run -d -p 8000:80 crawl4ai</code></pre>
|
||||
</li>
|
||||
</ol>
|
||||
<p class="mb-4">
|
||||
For more information about how to run Crawl4AI as a local server, please refer to the
|
||||
<a href="https://github.com/unclecode/crawl4ai" class="text-blue-400">GitHub repository</a>.
|
||||
</p>
|
||||
</section>
|
||||
@@ -1,217 +0,0 @@
|
||||
<section class="try-it py-8 px-16 pb-20 bg-zinc-900 overflow-hidden">
|
||||
<div class="container mx-auto ">
|
||||
<h2 class="text-2xl font-bold mb-4 text-lime-500">Try It Now</h2>
|
||||
<div class="flex gap-4">
|
||||
<div class="flex flex-col flex-1 gap-2">
|
||||
<div class="flex flex-col">
|
||||
<label for="url-input" class="text-lime-500 font-bold text-xs">URL(s)</label>
|
||||
<input
|
||||
type="text"
|
||||
id="url-input"
|
||||
value="https://www.nbcnews.com/business"
|
||||
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-zinc-300"
|
||||
placeholder="Enter URL(s) separated by commas"
|
||||
/>
|
||||
</div>
|
||||
<div class="flex gap-2">
|
||||
<div class="flex flex-col">
|
||||
<label for="threshold" class="text-lime-500 font-bold text-xs">Min Words Threshold</label>
|
||||
<select
|
||||
id="threshold"
|
||||
class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-zinc-300"
|
||||
>
|
||||
<option value="1">1</option>
|
||||
<option value="5">5</option>
|
||||
<option value="10" selected>10</option>
|
||||
<option value="15">15</option>
|
||||
<option value="20">20</option>
|
||||
<option value="25">25</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="flex flex-col flex-1">
|
||||
<label for="css-selector" class="text-lime-500 font-bold text-xs">CSS Selector</label>
|
||||
<input
|
||||
type="text"
|
||||
id="css-selector"
|
||||
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-zinc-300 placeholder-lime-700"
|
||||
placeholder="CSS Selector (e.g. .content, #main, article)"
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
<div class="flex gap-2">
|
||||
<div class="flex flex-col">
|
||||
<label for="extraction-strategy-select" class="text-lime-500 font-bold text-xs"
|
||||
>Extraction Strategy</label
|
||||
>
|
||||
<select
|
||||
id="extraction-strategy-select"
|
||||
class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-zinc-300"
|
||||
>
|
||||
<option value="NoExtractionStrategy" selected>NoExtractionStrategy</option>
|
||||
<option value="CosineStrategy">CosineStrategy</option>
|
||||
<option value="LLMExtractionStrategy">LLMExtractionStrategy</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="flex flex-col">
|
||||
<label for="chunking-strategy-select" class="text-lime-500 font-bold text-xs"
|
||||
>Chunking Strategy</label
|
||||
>
|
||||
<select
|
||||
id="chunking-strategy-select"
|
||||
class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-zinc-300"
|
||||
>
|
||||
<option value="RegexChunking">RegexChunking</option>
|
||||
<option value="NlpSentenceChunking">NlpSentenceChunking</option>
|
||||
<option value="TopicSegmentationChunking">TopicSegmentationChunking</option>
|
||||
<option value="FixedLengthWordChunking">FixedLengthWordChunking</option>
|
||||
<option value="SlidingWindowChunking">SlidingWindowChunking</option>
|
||||
</select>
|
||||
</div>
|
||||
</div>
|
||||
<div id = "llm_settings" class="flex gap-2 hidden hidden">
|
||||
<div class="flex flex-col">
|
||||
<label for="provider-model-select" class="text-lime-500 font-bold text-xs"
|
||||
>Provider Model</label
|
||||
>
|
||||
<select
|
||||
id="provider-model-select"
|
||||
class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-zinc-300"
|
||||
>
|
||||
<option value="groq/llama3-70b-8192">groq/llama3-70b-8192</option>
|
||||
<option value="groq/llama3-8b-8192">groq/llama3-8b-8192</option>
|
||||
<option value="groq/mixtral-8x7b-32768">groq/mixtral-8x7b-32768</option>
|
||||
<option value="openai/gpt-4-turbo">gpt-4-turbo</option>
|
||||
<option value="openai/gpt-3.5-turbo">gpt-3.5-turbo</option>
|
||||
<option value="openai/gpt-4o">gpt-4o</option>
|
||||
<option value="anthropic/claude-3-haiku-20240307">claude-3-haiku</option>
|
||||
<option value="anthropic/claude-3-opus-20240229">claude-3-opus</option>
|
||||
<option value="anthropic/claude-3-sonnet-20240229">claude-3-sonnet</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="flex flex-col flex-1">
|
||||
<label for="token-input" class="text-lime-500 font-bold text-xs">API Token</label>
|
||||
<input
|
||||
type="password"
|
||||
id="token-input"
|
||||
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-zinc-300"
|
||||
placeholder="Enter Groq API token"
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
<div class="flex gap-2">
|
||||
<!-- Add two textarea one for getting Keyword Filter and another one Instruction, make both grow whole with-->
|
||||
<div id = "semantic_filter_div" class="flex flex-col flex-1 hidden">
|
||||
<label for="keyword-filter" class="text-lime-500 font-bold text-xs">Keyword Filter</label>
|
||||
<textarea
|
||||
id="semantic_filter"
|
||||
rows="3"
|
||||
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-zinc-300 placeholder-zinc-700"
|
||||
placeholder="Enter keywords for CosineStrategy to narrow down the content."
|
||||
></textarea>
|
||||
</div>
|
||||
<div id = "instruction_div" class="flex flex-col flex-1 hidden">
|
||||
<label for="instruction" class="text-lime-500 font-bold text-xs">Instruction</label>
|
||||
<textarea
|
||||
id="instruction"
|
||||
rows="3"
|
||||
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-zinc-300 placeholder-zinc-700"
|
||||
placeholder="Enter instruction for the LLMEstrategy to instruct the model."
|
||||
></textarea>
|
||||
</div>
|
||||
</div>
|
||||
<div class="flex gap-3">
|
||||
<div class="flex items-center gap-2">
|
||||
<input type="checkbox" id="bypass-cache-checkbox" />
|
||||
<label for="bypass-cache-checkbox" class="text-lime-500 font-bold">Bypass Cache</label>
|
||||
</div>
|
||||
<div class="flex items-center gap-2">
|
||||
<input type="checkbox" id="screenshot-checkbox" checked />
|
||||
<label for="screenshot-checkbox" class="text-lime-500 font-bold">Screenshot</label>
|
||||
</div>
|
||||
<div class="flex items-center gap-2 hidden">
|
||||
<input type="checkbox" id="extract-blocks-checkbox" />
|
||||
<label for="extract-blocks-checkbox" class="text-lime-500 font-bold">Extract Blocks</label>
|
||||
</div>
|
||||
<button id="crawl-btn" class="bg-lime-600 text-black font-bold px-4 py-0 rounded">Crawl</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="loading" class="hidden">
|
||||
<p class="text-white">Loading... Please wait.</p>
|
||||
</div>
|
||||
<div id="result" class="flex-1 overflow-x-auto">
|
||||
<div class="tab-buttons flex gap-2">
|
||||
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="json">
|
||||
JSON
|
||||
</button>
|
||||
<button
|
||||
class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="cleaned-html"
|
||||
>
|
||||
Cleaned HTML
|
||||
</button>
|
||||
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="markdown">
|
||||
Markdown
|
||||
</button>
|
||||
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="media">
|
||||
Medias
|
||||
</button>
|
||||
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="screenshot">
|
||||
Screenshot
|
||||
</button>
|
||||
</div>
|
||||
<div class="tab-content code bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
|
||||
<pre class="h-full flex"><code id="json-result" class="language-json"></code></pre>
|
||||
<pre class="hidden h-full flex"><code id="cleaned-html-result" class="language-html"></code></pre>
|
||||
<pre class="hidden h-full flex"><code id="markdown-result" class="language-markdown"></code></pre>
|
||||
<pre class="hidden h-full flex"><code id="media-result" class="language-json"></code></pre>
|
||||
<pre class="hidden h-full flex"><code id="screenshot-result"></code></pre>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="code_help" class="flex-1 overflow-x-auto">
|
||||
<div class="tab-buttons flex gap-2">
|
||||
<button class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="curl">
|
||||
cURL
|
||||
</button>
|
||||
<button
|
||||
class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="library"
|
||||
>
|
||||
Python
|
||||
</button>
|
||||
<button
|
||||
class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="python"
|
||||
>
|
||||
REST API
|
||||
</button>
|
||||
<!-- <button
|
||||
class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="nodejs"
|
||||
>
|
||||
Node.js
|
||||
</button> -->
|
||||
</div>
|
||||
<div class="tab-content result bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
|
||||
<pre class="h-full flex relative overflow-x-auto">
|
||||
<code id="curl-code" class="language-bash"></code>
|
||||
<button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="curl-code">Copy</button>
|
||||
</pre>
|
||||
<pre class="hidden h-full flex relative overflow-x-auto">
|
||||
<code id="python-code" class="language-python"></code>
|
||||
<button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="python-code">Copy</button>
|
||||
</pre>
|
||||
<pre class="hidden h-full flex relative overflow-x-auto">
|
||||
<code id="nodejs-code" class="language-javascript"></code>
|
||||
<button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="nodejs-code">Copy</button>
|
||||
</pre>
|
||||
<pre class="hidden h-full flex relative overflow-x-auto">
|
||||
<code id="library-code" class="language-python"></code>
|
||||
<button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="library-code">Copy</button>
|
||||
</pre>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
434
pages/tmp.html
434
pages/tmp.html
@@ -1,434 +0,0 @@
|
||||
<div class="w-3/4 p-4">
|
||||
<section id="installation" class="content-section active">
|
||||
<h1 class="text-2xl font-bold">Installation 💻</h1>
|
||||
<p class="mb-4">There are three ways to use Crawl4AI:</p>
|
||||
<ol class="list-decimal list-inside mb-4">
|
||||
<li class="">As a library</li>
|
||||
<li class="">As a local server (Docker)</li>
|
||||
<li class="">
|
||||
As a Google Colab notebook.
|
||||
<a href="https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk"
|
||||
><img
|
||||
src="https://colab.research.google.com/assets/colab-badge.svg"
|
||||
alt="Open In Colab"
|
||||
style="display: inline-block; width: 100px; height: 20px"
|
||||
/></a>
|
||||
</li>
|
||||
<p></p>
|
||||
|
||||
<p class="my-4">To install Crawl4AI as a library, follow these steps:</p>
|
||||
|
||||
<ol class="list-decimal list-inside mb-4">
|
||||
<li class="mb-4">
|
||||
Install the package from GitHub:
|
||||
<pre
|
||||
class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
|
||||
><code class="hljs language-bash">pip install git+https://github.com/unclecode/crawl4ai.git</code></pre>
|
||||
</li>
|
||||
<li class="mb-4">
|
||||
Alternatively, you can clone the repository and install the package locally:
|
||||
<pre
|
||||
class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
|
||||
><code class="language-python bash hljs">virtualenv venv
|
||||
source venv/<span class="hljs-built_in">bin</span>/activate
|
||||
git clone https://github.com/unclecode/crawl4ai.git
|
||||
cd crawl4ai
|
||||
pip install -e .
|
||||
</code></pre>
|
||||
</li>
|
||||
<li class="">
|
||||
Use docker to run the local server:
|
||||
<pre
|
||||
class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
|
||||
><code class="language-python bash hljs">docker build -t crawl4ai .
|
||||
<span class="hljs-comment"># docker build --platform linux/amd64 -t crawl4ai . For Mac users</span>
|
||||
docker run -d -p <span class="hljs-number">8000</span>:<span class="hljs-number">80</span> crawl4ai</code></pre>
|
||||
</li>
|
||||
</ol>
|
||||
<p class="mb-4">
|
||||
For more information about how to run Crawl4AI as a local server, please refer to the
|
||||
<a href="https://github.com/unclecode/crawl4ai" class="text-blue-400">GitHub repository</a>.
|
||||
</p>
|
||||
</ol>
|
||||
</section>
|
||||
<section id="how-to-guide" class="content-section">
|
||||
<h1 class="text-2xl font-bold">How to Guide</h1>
|
||||
<div class="flex flex-col gap-4 p-4 bg-zinc-900 text-lime-500">
|
||||
<!-- Step 1 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🌟
|
||||
<strong>Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun!</strong>
|
||||
</div>
|
||||
<div class="">
|
||||
First Step: Create an instance of WebCrawler and call the
|
||||
<code>warmup()</code> function.
|
||||
</div>
|
||||
<div>
|
||||
<pre><code class="language-python hljs">crawler = WebCrawler()
|
||||
crawler.warmup()</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 2 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🧠 <strong>Understanding 'bypass_cache' and 'include_raw_html' parameters:</strong>
|
||||
</div>
|
||||
<div class="">First crawl (caches the result):</div>
|
||||
<div>
|
||||
<pre><code class="language-python hljs">result = crawler.run(url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>)</code></pre>
|
||||
</div>
|
||||
<div class="">Second crawl (Force to crawl again):</div>
|
||||
<div>
|
||||
<pre><code class="language-python hljs">result = crawler.run(url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>, bypass_cache=<span class="hljs-literal">True</span>)</code></pre>
|
||||
<div class="bg-red-900 p-2 text-zinc-50">
|
||||
⚠️ Don't forget to set <code>`bypass_cache`</code> to True if you want to try different strategies
|
||||
for the same URL. Otherwise, the cached result will be returned. You can also set
|
||||
<code>`always_by_pass_cache`</code> in constructor to True to always bypass the cache.
|
||||
</div>
|
||||
</div>
|
||||
<div class="">Crawl result without raw HTML content:</div>
|
||||
<div>
|
||||
<pre><code class="language-python hljs">result = crawler.run(url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>, include_raw_html=<span class="hljs-literal">False</span>)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 3 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
📄
|
||||
<strong
|
||||
>The 'include_raw_html' parameter, when set to True, includes the raw HTML content in the response.
|
||||
By default, it is set to True.</strong
|
||||
>
|
||||
</div>
|
||||
<div class="">Set <code>always_by_pass_cache</code> to True:</div>
|
||||
<div>
|
||||
<pre><code class="language-python hljs">crawler.always_by_pass_cache = <span class="hljs-literal">True</span></code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 4 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🧩 <strong>Let's add a chunking strategy: RegexChunking!</strong>
|
||||
</div>
|
||||
<div class="">Using RegexChunking:</div>
|
||||
<div>
|
||||
<pre><code class="language-python hljs">result = crawler.run(
|
||||
url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>,
|
||||
chunking_strategy=RegexChunking(patterns=[<span class="hljs-string">"\n\n"</span>])
|
||||
)</code></pre>
|
||||
</div>
|
||||
<div class="">Using NlpSentenceChunking:</div>
|
||||
<div>
|
||||
<pre><code class="language-python hljs">result = crawler.run(
|
||||
url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>,
|
||||
chunking_strategy=NlpSentenceChunking()
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 5 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🧠 <strong>Let's get smarter with an extraction strategy: CosineStrategy!</strong>
|
||||
</div>
|
||||
<div class="">Using CosineStrategy:</div>
|
||||
<div>
|
||||
<pre><code class="language-python hljs">result = crawler.run(
|
||||
url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>,
|
||||
extraction_strategy=CosineStrategy(word_count_threshold=<span class="hljs-number">20</span>, max_dist=<span class="hljs-number">0.2</span>, linkage_method=<span class="hljs-string">"ward"</span>, top_k=<span class="hljs-number">3</span>)
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 6 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🤖
|
||||
<strong>Time to bring in the big guns: LLMExtractionStrategy without instructions!</strong>
|
||||
</div>
|
||||
<div class="">Using LLMExtractionStrategy without instructions:</div>
|
||||
<div>
|
||||
<pre><code class="language-python hljs">result = crawler.run(
|
||||
url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>,
|
||||
extraction_strategy=LLMExtractionStrategy(provider=<span class="hljs-string">"openai/gpt-4o"</span>, api_token=os.getenv(<span class="hljs-string">'OPENAI_API_KEY'</span>))
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 7 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
📜
|
||||
<strong>Let's make it even more interesting: LLMExtractionStrategy with instructions!</strong>
|
||||
</div>
|
||||
<div class="">Using LLMExtractionStrategy with instructions:</div>
|
||||
<div>
|
||||
<pre><code class="language-python hljs">result = crawler.run(
|
||||
url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>,
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider=<span class="hljs-string">"openai/gpt-4o"</span>,
|
||||
api_token=os.getenv(<span class="hljs-string">'OPENAI_API_KEY'</span>),
|
||||
instruction=<span class="hljs-string">"I am interested in only financial news"</span>
|
||||
)
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 8 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🎯
|
||||
<strong>Targeted extraction: Let's use a CSS selector to extract only H2 tags!</strong>
|
||||
</div>
|
||||
<div class="">Using CSS selector to extract H2 tags:</div>
|
||||
<div>
|
||||
<pre><code class="language-python hljs">result = crawler.run(
|
||||
url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>,
|
||||
css_selector=<span class="hljs-string">"h2"</span>
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 9 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🖱️
|
||||
<strong>Let's get interactive: Passing JavaScript code to click 'Load More' button!</strong>
|
||||
</div>
|
||||
<div class="">Using JavaScript to click 'Load More' button:</div>
|
||||
<div>
|
||||
<pre><code class="language-python hljs">js_code = <span class="hljs-string">"""
|
||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||
loadMoreButton && loadMoreButton.click();
|
||||
"""</span>
|
||||
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
||||
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=<span class="hljs-literal">True</span>)
|
||||
result = crawler.run(url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Conclusion -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🎉
|
||||
<strong
|
||||
>Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the
|
||||
web like a pro! 🕸️</strong
|
||||
>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section id="chunking-strategies" class="content-section">
|
||||
<div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
|
||||
<div class="text-gray-300 prose prose-sm">
|
||||
<h3>RegexChunking</h3>
|
||||
<p>
|
||||
<code>RegexChunking</code> is a text chunking strategy that splits a given text into smaller parts
|
||||
using regular expressions. This is useful for preparing large texts for processing by language
|
||||
models, ensuring they are divided into manageable segments.
|
||||
</p>
|
||||
<h4>Constructor Parameters:</h4>
|
||||
<ul>
|
||||
<li>
|
||||
<code>patterns</code> (list, optional): A list of regular expression patterns used to split the
|
||||
text. Default is to split by double newlines (<code>['\n\n']</code>).
|
||||
</li>
|
||||
</ul>
|
||||
<h4>Example usage:</h4>
|
||||
<pre><code class="language-python">chunker = RegexChunking(patterns=[r'\n\n', r'\. '])
|
||||
chunks = chunker.chunk("This is a sample text. It will be split into chunks.")
|
||||
</code></pre>
|
||||
</div>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
|
||||
<div class="text-gray-300 prose prose-sm">
|
||||
<h3>NlpSentenceChunking</h3>
|
||||
<p>
|
||||
<code>NlpSentenceChunking</code> uses a natural language processing model to chunk a given text into
|
||||
sentences. This approach leverages SpaCy to accurately split text based on sentence boundaries.
|
||||
</p>
|
||||
<h4>Constructor Parameters:</h4>
|
||||
<ul>
|
||||
<li>
|
||||
None.
|
||||
</li>
|
||||
</ul>
|
||||
<h4>Example usage:</h4>
|
||||
<pre><code class="language-python">chunker = NlpSentenceChunking()
|
||||
chunks = chunker.chunk("This is a sample text. It will be split into sentences.")
|
||||
</code></pre>
|
||||
</div>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
|
||||
<div class="text-gray-300 prose prose-sm">
|
||||
<h3>TopicSegmentationChunking</h3>
|
||||
<p>
|
||||
<code>TopicSegmentationChunking</code> uses the TextTiling algorithm to segment a given text into
|
||||
topic-based chunks. This method identifies thematic boundaries in the text.
|
||||
</p>
|
||||
<h4>Constructor Parameters:</h4>
|
||||
<ul>
|
||||
<li>
|
||||
<code>num_keywords</code> (int, optional): The number of keywords to extract for each topic
|
||||
segment. Default is <code>3</code>.
|
||||
</li>
|
||||
</ul>
|
||||
<h4>Example usage:</h4>
|
||||
<pre><code class="language-python">chunker = TopicSegmentationChunking(num_keywords=3)
|
||||
chunks = chunker.chunk("This is a sample text. It will be split into topic-based segments.")
|
||||
</code></pre>
|
||||
</div>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
|
||||
<div class="text-gray-300 prose prose-sm">
|
||||
<h3>FixedLengthWordChunking</h3>
|
||||
<p>
|
||||
<code>FixedLengthWordChunking</code> splits a given text into chunks of fixed length, based on the
|
||||
number of words.
|
||||
</p>
|
||||
<h4>Constructor Parameters:</h4>
|
||||
<ul>
|
||||
<li>
|
||||
<code>chunk_size</code> (int, optional): The number of words in each chunk. Default is
|
||||
<code>100</code>.
|
||||
</li>
|
||||
</ul>
|
||||
<h4>Example usage:</h4>
|
||||
<pre><code class="language-python">chunker = FixedLengthWordChunking(chunk_size=100)
|
||||
chunks = chunker.chunk("This is a sample text. It will be split into fixed-length word chunks.")
|
||||
</code></pre>
|
||||
</div>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
|
||||
<div class="text-gray-300 prose prose-sm">
|
||||
<h3>SlidingWindowChunking</h3>
|
||||
<p>
|
||||
<code>SlidingWindowChunking</code> uses a sliding window approach to chunk a given text. Each chunk
|
||||
has a fixed length, and the window slides by a specified step size.
|
||||
</p>
|
||||
<h4>Constructor Parameters:</h4>
|
||||
<ul>
|
||||
<li>
|
||||
<code>window_size</code> (int, optional): The number of words in each chunk. Default is
|
||||
<code>100</code>.
|
||||
</li>
|
||||
<li>
|
||||
<code>step</code> (int, optional): The number of words to slide the window. Default is
|
||||
<code>50</code>.
|
||||
</li>
|
||||
</ul>
|
||||
<h4>Example usage:</h4>
|
||||
<pre><code class="language-python">chunker = SlidingWindowChunking(window_size=100, step=50)
|
||||
chunks = chunker.chunk("This is a sample text. It will be split using a sliding window approach.")
|
||||
</code></pre>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
<section id="extraction-strategies" class="content-section">
|
||||
<div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
|
||||
<div class="text-gray-300 prose prose-sm">
|
||||
<h3>NoExtractionStrategy</h3>
|
||||
<p>
|
||||
<code>NoExtractionStrategy</code> is a basic extraction strategy that returns the entire HTML
|
||||
content without any modification. It is useful for cases where no specific extraction is required.
|
||||
Only clean html, and amrkdown.
|
||||
</p>
|
||||
<h4>Constructor Parameters:</h4>
|
||||
<p>None.</p>
|
||||
<h4>Example usage:</h4>
|
||||
<pre><code class="language-python">extractor = NoExtractionStrategy()
|
||||
extracted_content = extractor.extract(url, html)
|
||||
</code></pre>
|
||||
</div>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
|
||||
<div class="text-gray-300 prose prose-sm">
|
||||
<h3>LLMExtractionStrategy</h3>
|
||||
<p>
|
||||
<code>LLMExtractionStrategy</code> uses a Language Model (LLM) to extract meaningful blocks or
|
||||
chunks from the given HTML content. This strategy leverages an external provider for language model
|
||||
completions.
|
||||
</p>
|
||||
<h4>Constructor Parameters:</h4>
|
||||
<ul>
|
||||
<li>
|
||||
<code>provider</code> (str, optional): The provider to use for the language model completions.
|
||||
Default is <code>DEFAULT_PROVIDER</code> (e.g., openai/gpt-4).
|
||||
</li>
|
||||
<li>
|
||||
<code>api_token</code> (str, optional): The API token for the provider. If not provided, it will
|
||||
try to load from the environment variable <code>OPENAI_API_KEY</code>.
|
||||
</li>
|
||||
<li>
|
||||
<code>instruction</code> (str, optional): An instruction to guide the LLM on how to perform the
|
||||
extraction. This allows users to specify the type of data they are interested in or set the tone
|
||||
of the response. Default is <code>None</code>.
|
||||
</li>
|
||||
</ul>
|
||||
<h4>Example usage:</h4>
|
||||
<pre><code class="language-python">extractor = LLMExtractionStrategy(provider='openai', api_token='your_api_token', instruction='Extract only news about AI.')
|
||||
extracted_content = extractor.extract(url, html)
|
||||
</code></pre>
|
||||
<p>
|
||||
By providing clear instructions, users can tailor the extraction process to their specific needs,
|
||||
enhancing the relevance and utility of the extracted content.
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
|
||||
<div class="text-gray-300 prose prose-sm">
|
||||
<h3>CosineStrategy</h3>
|
||||
<p>
|
||||
<code>CosineStrategy</code> uses hierarchical clustering based on cosine similarity to extract
|
||||
clusters of text from the given HTML content. This strategy is suitable for identifying related
|
||||
content sections.
|
||||
</p>
|
||||
<h4>Constructor Parameters:</h4>
|
||||
<ul>
|
||||
<li>
|
||||
<code>semantic_filter</code> (str, optional): A string containing keywords for filtering relevant
|
||||
documents before clustering. If provided, documents are filtered based on their cosine
|
||||
similarity to the keyword filter embedding. Default is <code>None</code>.
|
||||
</li>
|
||||
<li>
|
||||
<code>word_count_threshold</code> (int, optional): Minimum number of words per cluster. Default
|
||||
is <code>20</code>.
|
||||
</li>
|
||||
<li>
|
||||
<code>max_dist</code> (float, optional): The maximum cophenetic distance on the dendrogram to
|
||||
form clusters. Default is <code>0.2</code>.
|
||||
</li>
|
||||
<li>
|
||||
<code>linkage_method</code> (str, optional): The linkage method for hierarchical clustering.
|
||||
Default is <code>'ward'</code>.
|
||||
</li>
|
||||
<li>
|
||||
<code>top_k</code> (int, optional): Number of top categories to extract. Default is
|
||||
<code>3</code>.
|
||||
</li>
|
||||
<li>
|
||||
<code>model_name</code> (str, optional): The model name for embedding generation. Default is
|
||||
<code>'BAAI/bge-small-en-v1.5'</code>.
|
||||
</li>
|
||||
</ul>
|
||||
<h4>Example usage:</h4>
|
||||
<pre><code class="language-python">extractor = CosineStrategy(semantic_filter='artificial intelligence', word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name='BAAI/bge-small-en-v1.5')
|
||||
extracted_content = extractor.extract(url, html)
|
||||
</code></pre>
|
||||
<h4>Cosine Similarity Filtering</h4>
|
||||
<p>
|
||||
When a <code>semantic_filter</code> is provided, the <code>CosineStrategy</code> applies an
|
||||
embedding-based filtering process to select relevant documents before performing hierarchical
|
||||
clustering.
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
|
||||
<div class="text-gray-300 prose prose-sm">
|
||||
<h3>TopicExtractionStrategy</h3>
|
||||
<p>
|
||||
<code>TopicExtractionStrategy</code> uses the TextTiling algorithm to segment the HTML content into
|
||||
topics and extracts keywords for each segment. This strategy is useful for identifying and
|
||||
summarizing thematic content.
|
||||
</p>
|
||||
<h4>Constructor Parameters:</h4>
|
||||
<ul>
|
||||
<li>
|
||||
<code>num_keywords</code> (int, optional): Number of keywords to represent each topic segment.
|
||||
Default is <code>3</code>.
|
||||
</li>
|
||||
</ul>
|
||||
<h4>Example usage:</h4>
|
||||
<pre><code class="language-python">extractor = TopicExtractionStrategy(num_keywords=3)
|
||||
extracted_content = extractor.extract(url, html)
|
||||
</code></pre>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
</div>
|
||||
@@ -13,8 +13,8 @@ parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__f
|
||||
sys.path.append(parent_dir)
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
|
||||
from crawl4ai.content_scrapping_strategy import WebScrapingStrategy
|
||||
from crawl4ai.content_scrapping_strategy import WebScrapingStrategy as WebScrapingStrategyCurrent
|
||||
from crawl4ai.content_scraping_strategy import WebScrapingStrategy
|
||||
from crawl4ai.content_scraping_strategy import WebScrapingStrategy as WebScrapingStrategyCurrent
|
||||
# from crawl4ai.content_scrapping_strategy_current import WebScrapingStrategy as WebScrapingStrategyCurrent
|
||||
|
||||
@dataclass
|
||||
|
||||
165
tests/async/test_markdown_genertor.py
Normal file
165
tests/async/test_markdown_genertor.py
Normal file
@@ -0,0 +1,165 @@
|
||||
# ## Issue #236
|
||||
# - **Last Updated:** 2024-11-11 01:42:14
|
||||
# - **Title:** [user data crawling opens two windows, unable to control correct user browser](https://github.com/unclecode/crawl4ai/issues/236)
|
||||
# - **State:** open
|
||||
|
||||
import os, sys, time
|
||||
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
sys.path.append(parent_dir)
|
||||
__location__ = os.path.realpath( os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
import asyncio
|
||||
import os
|
||||
import time
|
||||
from typing import Dict, Any
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerationStrategy
|
||||
|
||||
# Get current directory
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
|
||||
def print_test_result(name: str, result: Dict[str, Any], execution_time: float):
|
||||
"""Helper function to print test results."""
|
||||
print(f"\n{'='*20} {name} {'='*20}")
|
||||
print(f"Execution time: {execution_time:.4f} seconds")
|
||||
|
||||
|
||||
# Save markdown to files
|
||||
for key, content in result.items():
|
||||
if isinstance(content, str):
|
||||
with open(__location__ + f"/output/{name.lower()}_{key}.md", "w") as f:
|
||||
f.write(content)
|
||||
|
||||
# # Print first few lines of each markdown version
|
||||
# for key, content in result.items():
|
||||
# if isinstance(content, str):
|
||||
# preview = '\n'.join(content.split('\n')[:3])
|
||||
# print(f"\n{key} (first 3 lines):")
|
||||
# print(preview)
|
||||
# print(f"Total length: {len(content)} characters")
|
||||
|
||||
def test_basic_markdown_conversion():
|
||||
"""Test basic markdown conversion with links."""
|
||||
with open(__location__ + "/data/wikipedia.html", "r") as f:
|
||||
cleaned_html = f.read()
|
||||
|
||||
generator = DefaultMarkdownGenerationStrategy()
|
||||
|
||||
start_time = time.perf_counter()
|
||||
result = generator.generate_markdown(
|
||||
cleaned_html=cleaned_html,
|
||||
base_url="https://en.wikipedia.org"
|
||||
)
|
||||
execution_time = time.perf_counter() - start_time
|
||||
|
||||
print_test_result("Basic Markdown Conversion", {
|
||||
'raw': result.raw_markdown,
|
||||
'with_citations': result.markdown_with_citations,
|
||||
'references': result.references_markdown
|
||||
}, execution_time)
|
||||
|
||||
# Basic assertions
|
||||
assert result.raw_markdown, "Raw markdown should not be empty"
|
||||
assert result.markdown_with_citations, "Markdown with citations should not be empty"
|
||||
assert result.references_markdown, "References should not be empty"
|
||||
assert "⟨" in result.markdown_with_citations, "Citations should use ⟨⟩ brackets"
|
||||
assert "## References" in result.references_markdown, "Should contain references section"
|
||||
|
||||
def test_relative_links():
|
||||
"""Test handling of relative links with base URL."""
|
||||
markdown = """
|
||||
Here's a [relative link](/wiki/Apple) and an [absolute link](https://example.com).
|
||||
Also an [image](/images/test.png) and another [page](/wiki/Banana).
|
||||
"""
|
||||
|
||||
generator = DefaultMarkdownGenerationStrategy()
|
||||
result = generator.generate_markdown(
|
||||
cleaned_html=markdown,
|
||||
base_url="https://en.wikipedia.org"
|
||||
)
|
||||
|
||||
assert "https://en.wikipedia.org/wiki/Apple" in result.references_markdown
|
||||
assert "https://example.com" in result.references_markdown
|
||||
assert "https://en.wikipedia.org/images/test.png" in result.references_markdown
|
||||
|
||||
def test_duplicate_links():
|
||||
"""Test handling of duplicate links."""
|
||||
markdown = """
|
||||
Here's a [link](/test) and another [link](/test) and a [different link](/other).
|
||||
"""
|
||||
|
||||
generator = DefaultMarkdownGenerationStrategy()
|
||||
result = generator.generate_markdown(
|
||||
cleaned_html=markdown,
|
||||
base_url="https://example.com"
|
||||
)
|
||||
|
||||
# Count citations in markdown
|
||||
citations = result.markdown_with_citations.count("⟨1⟩")
|
||||
assert citations == 2, "Same link should use same citation number"
|
||||
|
||||
def test_link_descriptions():
|
||||
"""Test handling of link titles and descriptions."""
|
||||
markdown = """
|
||||
Here's a [link with title](/test "Test Title") and a [link with description](/other) to test.
|
||||
"""
|
||||
|
||||
generator = DefaultMarkdownGenerationStrategy()
|
||||
result = generator.generate_markdown(
|
||||
cleaned_html=markdown,
|
||||
base_url="https://example.com"
|
||||
)
|
||||
|
||||
assert "Test Title" in result.references_markdown, "Link title should be in references"
|
||||
assert "link with description" in result.references_markdown, "Link text should be in references"
|
||||
|
||||
def test_performance_large_document():
|
||||
"""Test performance with large document."""
|
||||
with open(__location__ + "/data/wikipedia.md", "r") as f:
|
||||
markdown = f.read()
|
||||
|
||||
# Test with multiple iterations
|
||||
iterations = 5
|
||||
times = []
|
||||
|
||||
generator = DefaultMarkdownGenerationStrategy()
|
||||
|
||||
for i in range(iterations):
|
||||
start_time = time.perf_counter()
|
||||
result = generator.generate_markdown(
|
||||
cleaned_html=markdown,
|
||||
base_url="https://en.wikipedia.org"
|
||||
)
|
||||
end_time = time.perf_counter()
|
||||
times.append(end_time - start_time)
|
||||
|
||||
avg_time = sum(times) / len(times)
|
||||
print(f"\n{'='*20} Performance Test {'='*20}")
|
||||
print(f"Average execution time over {iterations} iterations: {avg_time:.4f} seconds")
|
||||
print(f"Min time: {min(times):.4f} seconds")
|
||||
print(f"Max time: {max(times):.4f} seconds")
|
||||
|
||||
def test_image_links():
|
||||
"""Test handling of image links."""
|
||||
markdown = """
|
||||
Here's an  and another .
|
||||
And a regular [link](/page).
|
||||
"""
|
||||
|
||||
generator = DefaultMarkdownGenerationStrategy()
|
||||
result = generator.generate_markdown(
|
||||
cleaned_html=markdown,
|
||||
base_url="https://example.com"
|
||||
)
|
||||
|
||||
assert "![" in result.markdown_with_citations, "Image markdown syntax should be preserved"
|
||||
assert "Image Title" in result.references_markdown, "Image title should be in references"
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("Running markdown generation strategy tests...")
|
||||
|
||||
test_basic_markdown_conversion()
|
||||
test_relative_links()
|
||||
test_duplicate_links()
|
||||
test_link_descriptions()
|
||||
test_performance_large_document()
|
||||
test_image_links()
|
||||
|
||||
Reference in New Issue
Block a user