Compare commits


44 Commits

Author SHA1 Message Date
Unclecode  b1ac4fe023  Merge branch 'main' into ssh-server  2024-12-12 12:25:26 +00:00
Unclecode  a3c92141a1  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-12-12 12:25:01 +00:00
Unclecode  3fd777dd6f  remove crawl endpoints  2024-12-12 12:24:13 +00:00
Unclecode  d7200138a0  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-12-08 12:06:53 +00:00
Unclecode  be37abe05a  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-12-04 12:31:45 +00:00
Unclecode  90ba51b52f  fix(mkdocs): correct typo in Docker Deployment navigation entry  2024-12-04 12:31:41 +00:00
Unclecode  11721eb0ce  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-11-05 13:02:59 +00:00
Unclecode  1222e456fb  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-11-05 12:58:30 +00:00
Unclecode  e8aaa57cb2  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-10-30 12:59:34 +00:00
Unclecode  a661b3173d  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-10-30 12:47:07 +00:00
Unclecode  b781b6df96  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-10-27 11:42:23 +00:00
Unclecode  14e537fdd3  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-08-04 06:57:16 +00:00
Unclecode  64b33af0e0  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-08-02 08:04:54 +00:00
Unclecode  1afcdb6996  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-07-08 12:24:13 +00:00
Unclecode  ca625b3152  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-07-08 12:02:19 +00:00
Unclecode  6521b4745f  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-07-08 08:35:49 +00:00
Unclecode  241862bfe6  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-07-03 07:27:37 +00:00
Unclecode  f2491b6c1a  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-29 16:34:15 +00:00
Unclecode  886622cb1e  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-29 16:23:44 +00:00
Unclecode  13dc254438  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-26 07:35:06 +00:00
Unclecode  096929153f  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-26 05:45:25 +00:00
Unclecode  7e95c38acb  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-24 14:40:48 +00:00
Unclecode  c697bf23e4  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-22 16:37:27 +00:00
Unclecode  b951d34ed0  chore: Update fetch URL to use HTTPS  2024-06-22 16:37:21 +00:00
Unclecode  c8a10dc455  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-22 12:54:41 +00:00
Unclecode  9e0ded8da0  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-22 12:41:52 +00:00
Unclecode  48c27899b7  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-22 12:38:14 +00:00
Unclecode  3c32b0abed  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-21 09:58:17 +00:00
Unclecode  a215ec08d6  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-19 10:51:31 +00:00
Unclecode  5d3fef45f7  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-18 12:02:29 +00:00
Unclecode  77df6db453  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-08 10:38:10 +00:00
Unclecode  2124652327  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-08 10:07:30 +00:00
Unclecode  255bde70c9  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-08 08:53:54 +00:00
Unclecode  04808b5dc9  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-07 12:44:41 +00:00
Unclecode  b3a150f3d1  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-07 08:26:43 +00:00
Unclecode  de80a2da09  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-07 08:25:49 +00:00
Unclecode  df4cda8322  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-07 08:24:46 +00:00
Unclecode  7717a3b948  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-07 08:19:37 +00:00
Unclecode  a4a6b2075f  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-07 08:18:19 +00:00
Unclecode  4010558885  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-02 08:12:32 +00:00
Unclecode  b0cf5076da  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-02 08:09:25 +00:00
Unclecode  0d6e9e37ca  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-02 08:06:56 +00:00
Unclecode  9b0f71ba88  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-02 07:56:00 +00:00
Unclecode  6ddccc144c  chore: Bump version to 0.2.2 in setup.py  2024-05-19 16:19:40 +00:00
15 changed files with 398 additions and 167 deletions

.do/deploy.template.yaml (new file, +22)

@@ -0,0 +1,22 @@
spec:
  name: crawl4ai
  services:
    - name: crawl4ai
      git:
        branch: 0.3.74
        repo_clone_url: https://github.com/unclecode/crawl4ai.git
      dockerfile_path: Dockerfile
      http_port: 11235
      instance_count: 1
      instance_size_slug: professional-xs
      health_check:
        http_path: /health
      envs:
        - key: INSTALL_TYPE
          value: "basic"
        - key: PYTHON_VERSION
          value: "3.10"
        - key: ENABLE_GPU
          value: "false"
      routes:
        - path: /
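The spec above builds the repository's Dockerfile from the 0.3.74 branch, serves the container on port 11235, and health-checks it via GET /health. A minimal sketch of probing a deployed instance, assuming only what the spec declares (the hostname is a placeholder; App Platform assigns one per app):

```python
# Hedged sketch: hit the /health route the spec's health_check points at.
# The hostname below is an assumption, not a real deployment.
import requests

resp = requests.get("https://crawl4ai-demo.ondigitalocean.app/health", timeout=10)
resp.raise_for_status()
print(resp.json())  # the service's health payload
```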

MANIFEST.in

@@ -1,2 +1 @@
include requirements.txt
recursive-include crawl4ai/js_snippet *.js
include requirements.txt

README.md

@@ -1,4 +1,4 @@
# 🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper.
# 🚀🤖 Crawl4AI: Crawl Smarter, Faster, Freely. For AI.
<a href="https://trendshift.io/repositories/11716" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11716" alt="unclecode%2Fcrawl4ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>

crawl4ai/_version.py

@@ -1,2 +1,2 @@
# crawl4ai/_version.py
__version__ = "0.4.22"
__version__ = "0.4.2"

crawl4ai/async_configs.py

@@ -7,7 +7,6 @@ from .config import (
from .user_agent_generator import UserAgentGenerator
from .extraction_strategy import ExtractionStrategy
from .chunking_strategy import ChunkingStrategy
from .markdown_generation_strategy import MarkdownGenerationStrategy
class BrowserConfig:
"""
@@ -270,7 +269,6 @@ class CrawlerRunConfig:
word_count_threshold: int = MIN_WORD_THRESHOLD ,
extraction_strategy : ExtractionStrategy=None, # Will default to NoExtractionStrategy if None
chunking_strategy : ChunkingStrategy= None, # Will default to RegexChunking if None
markdown_generator : MarkdownGenerationStrategy = None,
content_filter=None,
cache_mode=None,
session_id: str = None,
@@ -311,7 +309,6 @@ class CrawlerRunConfig:
self.word_count_threshold = word_count_threshold
self.extraction_strategy = extraction_strategy
self.chunking_strategy = chunking_strategy
self.markdown_generator = markdown_generator
self.content_filter = content_filter
self.cache_mode = cache_mode
self.session_id = session_id
@@ -367,7 +364,6 @@ class CrawlerRunConfig:
word_count_threshold=kwargs.get("word_count_threshold", 200),
extraction_strategy=kwargs.get("extraction_strategy"),
chunking_strategy=kwargs.get("chunking_strategy"),
markdown_generator=kwargs.get("markdown_generator"),
content_filter=kwargs.get("content_filter"),
cache_mode=kwargs.get("cache_mode"),
session_id=kwargs.get("session_id"),
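The two classes touched here split the older flat keyword arguments in two: BrowserConfig carries browser-level options, while CrawlerRunConfig carries per-crawl options such as word_count_threshold, cache_mode, and session_id. A minimal usage sketch, assuming the crawl4ai 0.4.x top-level exports:

```python
# Hedged sketch of driving a crawl through the config objects in this diff;
# assumes AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, and CacheMode are
# importable from the crawl4ai package root, as in 0.4.x.
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode

async def main():
    browser_cfg = BrowserConfig(headless=True)   # browser-level options
    run_cfg = CrawlerRunConfig(
        word_count_threshold=10,                 # skip near-empty text blocks
        cache_mode=CacheMode.BYPASS,             # always re-fetch
        session_id="demo",                       # reuse one page across runs
    )
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun("https://example.com", config=run_cfg)
        print(result.success)

asyncio.run(main())
```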

crawl4ai/async_webcrawler.py

@@ -7,8 +7,7 @@ from pathlib import Path
from typing import Optional, List, Union
import json
import asyncio
# from contextlib import nullcontext, asynccontextmanager
from contextlib import asynccontextmanager
from contextlib import nullcontext, asynccontextmanager
from .models import CrawlResult, MarkdownGenerationResult
from .async_database import async_db_manager
from .chunking_strategy import *
@@ -16,7 +15,6 @@ from .content_filter_strategy import *
from .extraction_strategy import *
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
from .content_scraping_strategy import WebScrapingStrategy
from .async_logger import AsyncLogger
from .async_configs import BrowserConfig, CrawlerRunConfig
@@ -134,12 +132,17 @@ class AsyncWebCrawler:
async def __aexit__(self, exc_type, exc_val, exc_tb):
await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb)
@asynccontextmanager
async def nullcontext(self):
yield
async def awarmup(self):
"""Initialize the crawler with warm-up sequence."""
self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
self.ready = True
@asynccontextmanager
async def nullcontext(self):
"""异步空上下文管理器"""
@@ -320,8 +323,7 @@ class AsyncWebCrawler:
config=config, # Pass the config object instead of individual parameters
screenshot=screenshot_data,
pdf_data=pdf_data,
verbose=config.verbose,
**kwargs
verbose=config.verbose
)
# Set response data
@@ -422,8 +424,7 @@ class AsyncWebCrawler:
css_selector=config.css_selector,
only_text=config.only_text,
image_description_min_word_threshold=config.image_description_min_word_threshold,
content_filter=config.content_filter,
**kwargs
content_filter=config.content_filter
)
if result is None:
@@ -434,29 +435,16 @@ class AsyncWebCrawler:
except Exception as e:
raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")
# Extract results
markdown_v2 = result.get("markdown_v2", None)
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
markdown = sanitize_input_encode(result.get("markdown", ""))
fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
fit_html = sanitize_input_encode(result.get("fit_html", ""))
media = result.get("media", [])
links = result.get("links", [])
metadata = result.get("metadata", {})
# Markdown Generation
markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
if not config.content_filter and not markdown_generator.content_filter:
markdown_generator.content_filter = PruningContentFilter()
markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
cleaned_html=cleaned_html,
base_url=url,
# html2text_options=kwargs.get('html2text', {})
)
markdown_v2 = markdown_result
markdown = sanitize_input_encode(markdown_result.raw_markdown)
# Log processing completion
self.logger.info(
message="Processed {url:.50}... | Time: {timing}ms",

crawl4ai/content_scraping_strategy.py

@@ -602,16 +602,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
cleaned_html = str_body.replace('\n\n', '\n').replace('  ', ' ')
# markdown_content = self._generate_markdown_content(
# cleaned_html=cleaned_html,
# html=html,
# url=url,
# success=success,
# **kwargs
# )
markdown_content = self._generate_markdown_content(
cleaned_html=cleaned_html,
html=html,
url=url,
success=success,
**kwargs
)
return {
# **markdown_content,
**markdown_content,
'cleaned_html': cleaned_html,
'success': success,
'media': media,

(docs example; file path not shown)

@@ -1,40 +1,41 @@
import os
import time
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import *
import asyncio
from pydantic import BaseModel, Field
url = r'https://openai.com/api/pricing/'
crawler = WebCrawler()
crawler.warmup()
from pydantic import BaseModel, Field
class OpenAIModelFee(BaseModel):
model_name: str = Field(..., description="Name of the OpenAI model.")
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
from crawl4ai import AsyncWebCrawler
result = crawler.run(
url=url,
word_count_threshold=1,
extraction_strategy= LLMExtractionStrategy(
# provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
provider= "groq/llama-3.1-70b-versatile", api_token = os.getenv('GROQ_API_KEY'),
schema=OpenAIModelFee.model_json_schema(),
extraction_type="schema",
instruction="From the crawled content, extract all mentioned model names along with their "\
"fees for input and output tokens. Make sure not to miss anything in the entire content. "\
'One extracted model JSON format should look like this: '\
'{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
),
bypass_cache=True,
)
async def main():
# Use AsyncWebCrawler
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url=url,
word_count_threshold=1,
extraction_strategy= LLMExtractionStrategy(
# provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
provider= "groq/llama-3.1-70b-versatile", api_token = os.getenv('GROQ_API_KEY'),
schema=OpenAIModelFee.model_json_schema(),
extraction_type="schema",
instruction="From the crawled content, extract all mentioned model names along with their " \
"fees for input and output tokens. Make sure not to miss anything in the entire content. " \
'One extracted model JSON format should look like this: ' \
'{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
),
model_fees = json.loads(result.extracted_content)
)
print("Success:", result.success)
model_fees = json.loads(result.extracted_content)
print(len(model_fees))
print(len(model_fees))
with open(".data/data.json", "w", encoding="utf-8") as f:
f.write(result.extracted_content)
asyncio.run(main())
with open(".data/data.json", "w", encoding="utf-8") as f:
f.write(result.extracted_content)
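Assembled from the added lines above, the migrated async example reads as one piece roughly as follows (a sketch; the json import, implicit in the diff, is made explicit):

```python
import os
import json
import asyncio
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy

url = r'https://openai.com/api/pricing/'

class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url=url,
            word_count_threshold=1,
            extraction_strategy=LLMExtractionStrategy(
                provider="groq/llama-3.1-70b-versatile",
                api_token=os.getenv('GROQ_API_KEY'),
                schema=OpenAIModelFee.model_json_schema(),
                extraction_type="schema",
                instruction="From the crawled content, extract all mentioned model names "
                            "along with their fees for input and output tokens.",
            ),
        )
        print("Success:", result.success)
        model_fees = json.loads(result.extracted_content)
        print(len(model_fees))
        with open(".data/data.json", "w", encoding="utf-8") as f:
            f.write(result.extracted_content)

asyncio.run(main())
```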

(docs example; file path not shown)

@@ -142,7 +142,6 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
word_count_threshold=1,
page_timeout = 80000,
extraction_strategy=LLMExtractionStrategy(
provider=provider,
api_token=api_token,
@@ -498,21 +497,21 @@ async def main():
# Advanced examples
# await extract_structured_data_using_css_extractor()
await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
# await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
# await crawl_dynamic_content_pages_method_1()
# await crawl_dynamic_content_pages_method_2()
# Browser comparisons
# await crawl_custom_browser_type()
await crawl_custom_browser_type()
# Performance testing
# await speed_comparison()
# Screenshot example
# await capture_and_save_screenshot(
# "https://www.example.com",
# os.path.join(__location__, "tmp/example_screenshot.jpg")
# )
await capture_and_save_screenshot(
"https://www.example.com",
os.path.join(__location__, "tmp/example_screenshot.jpg")
)
if __name__ == "__main__":
asyncio.run(main())

(docs example; file path not shown)

@@ -239,10 +239,8 @@ async def crawl_dynamic_content_pages_method_1():
all_commits = []
js_next_page = """
(() => {
const button = document.querySelector('a[data-testid="pagination-next-button"]');
if (button) button.click();
})();
const button = document.querySelector('a[data-testid="pagination-next-button"]');
if (button) button.click();
"""
for page in range(3): # Crawl 3 pages
@@ -606,14 +604,14 @@ async def fit_markdown_remove_overlay():
async def main():
# await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
# await simple_crawl()
# await simple_example_with_running_js_code()
# await simple_example_with_css_selector()
# # await use_proxy()
# await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
# await extract_structured_data_using_css_extractor()
await simple_crawl()
await simple_example_with_running_js_code()
await simple_example_with_css_selector()
# await use_proxy()
await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
await extract_structured_data_using_css_extractor()
# LLM extraction examples
# await extract_structured_data_using_llm()
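The js_next_page change above drops the IIFE wrapper, since the snippet is injected into the page and executed as-is. A hedged sketch of how such a snippet is passed through a crawl, assuming CrawlerRunConfig's js_code, js_only, and session_id parameters from the 0.4.x API (the target URL is illustrative):

```python
# Sketch: re-run JS in the same live page across iterations. session_id keeps
# the page open; js_only skips re-navigation on later rounds.
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

JS_NEXT_PAGE = """
const button = document.querySelector('a[data-testid="pagination-next-button"]');
if (button) button.click();
"""

async def main():
    async with AsyncWebCrawler() as crawler:
        for page in range(3):
            cfg = CrawlerRunConfig(
                session_id="commits",
                js_code=JS_NEXT_PAGE if page > 0 else None,
                js_only=page > 0,
                cache_mode=CacheMode.BYPASS,
            )
            result = await crawler.arun("https://example.com/commits", config=cfg)
            print(f"page {page}: success={result.success}")

asyncio.run(main())
```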

docs/md/demo.md (new file, +231)

@@ -0,0 +1,231 @@
# Interactive Demo for Crawler
<div id="demo">
<form id="crawlForm" class="terminal-form">
<fieldset>
<legend>Enter URL and Options</legend>
<div class="form-group">
<label for="url">Enter URL:</label>
<input type="text" id="url" name="url" required>
</div>
<div class="form-group">
<label for="screenshot">Get Screenshot:</label>
<input type="checkbox" id="screenshot" name="screenshot">
</div>
<div class="form-group">
<button class="btn btn-default" type="submit">Submit</button>
</div>
</fieldset>
</form>
<div id="loading" class="loading-message">
<div class="terminal-alert terminal-alert-primary">Loading... Please wait.</div>
</div>
<section id="response" class="response-section">
<h2>Response</h2>
<div class="tabs">
<ul class="tab-list">
<li class="tab-item" onclick="showTab('markdown')">Markdown</li>
<li class="tab-item" onclick="showTab('cleanedHtml')">Cleaned HTML</li>
<li class="tab-item" onclick="showTab('media')">Media</li>
<li class="tab-item" onclick="showTab('extractedContent')">Extracted Content</li>
<li class="tab-item" onclick="showTab('screenshot')">Screenshot</li>
<li class="tab-item" onclick="showTab('pythonCode')">Python Code</li>
</ul>
<div class="tab-content" id="tab-markdown">
<header>
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('markdownContent')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('markdownContent', 'markdown.md')">Download</button>
</div>
</header>
<pre><code id="markdownContent" class="language-markdown hljs"></code></pre>
</div>
<div class="tab-content" id="tab-cleanedHtml" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('cleanedHtmlContent')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('cleanedHtmlContent', 'cleaned.html')">Download</button>
</div>
</header>
<pre><code id="cleanedHtmlContent" class="language-html hljs"></code></pre>
</div>
<div class="tab-content" id="tab-media" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('mediaContent')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('mediaContent', 'media.json')">Download</button>
</div>
</header>
<pre><code id="mediaContent" class="language-json hljs"></code></pre>
</div>
<div class="tab-content" id="tab-extractedContent" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('extractedContentContent')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('extractedContentContent', 'extracted_content.json')">Download</button>
</div>
</header>
<pre><code id="extractedContentContent" class="language-json hljs"></code></pre>
</div>
<div class="tab-content" id="tab-screenshot" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadImage('screenshotContent', 'screenshot.png')">Download</button>
</div>
</header>
<pre><img id="screenshotContent" /></pre>
</div>
<div class="tab-content" id="tab-pythonCode" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('pythonCode')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('pythonCode', 'example.py')">Download</button>
</div>
</header>
<pre><code id="pythonCode" class="language-python hljs"></code></pre>
</div>
</div>
</section>
<div id="error" class="error-message" style="display: none; margin-top:1em;">
<div class="terminal-alert terminal-alert-error"></div>
</div>
<script>
function showTab(tabId) {
const tabs = document.querySelectorAll('.tab-content');
tabs.forEach(tab => tab.style.display = 'none');
document.getElementById(`tab-${tabId}`).style.display = 'block';
}
function redo(codeBlock, codeText){
codeBlock.classList.remove('hljs');
codeBlock.removeAttribute('data-highlighted');
// Set new code and re-highlight
codeBlock.textContent = codeText;
hljs.highlightBlock(codeBlock);
}
function copyToClipboard(elementId) {
const content = document.getElementById(elementId).textContent;
navigator.clipboard.writeText(content).then(() => {
alert('Copied to clipboard');
});
}
function downloadContent(elementId, filename) {
const content = document.getElementById(elementId).textContent;
const blob = new Blob([content], { type: 'text/plain' });
const url = window.URL.createObjectURL(blob);
const a = document.createElement('a');
a.style.display = 'none';
a.href = url;
a.download = filename;
document.body.appendChild(a);
a.click();
window.URL.revokeObjectURL(url);
document.body.removeChild(a);
}
function downloadImage(elementId, filename) {
const content = document.getElementById(elementId).src;
const a = document.createElement('a');
a.style.display = 'none';
a.href = content;
a.download = filename;
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
}
document.getElementById('crawlForm').addEventListener('submit', function(event) {
event.preventDefault();
document.getElementById('loading').style.display = 'block';
document.getElementById('response').style.display = 'none';
const url = document.getElementById('url').value;
const screenshot = document.getElementById('screenshot').checked;
const data = {
urls: [url],
bypass_cache: false,
word_count_threshold: 5,
screenshot: screenshot
};
fetch('https://crawl4ai.com/crawl', {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify(data)
})
.then(response => {
if (!response.ok) {
if (response.status === 429) {
return response.json().then(err => {
throw Object.assign(new Error('Rate limit exceeded'), { status: 429, details: err });
});
}
throw new Error('Network response was not ok');
}
return response.json();
})
.then(data => {
data = data.results[0]; // Only one URL is requested
document.getElementById('loading').style.display = 'none';
document.getElementById('response').style.display = 'block';
redo(document.getElementById('markdownContent'), data.markdown);
redo(document.getElementById('cleanedHtmlContent'), data.cleaned_html);
redo(document.getElementById('mediaContent'), JSON.stringify(data.media, null, 2));
redo(document.getElementById('extractedContentContent'), data.extracted_content);
if (screenshot) {
document.getElementById('screenshotContent').src = `data:image/png;base64,${data.screenshot}`;
}
const pythonCode = `
from crawl4ai.web_crawler import WebCrawler
crawler = WebCrawler()
crawler.warmup()
result = crawler.run(
url='${url}',
screenshot=${screenshot}
)
print(result)
`;
redo(document.getElementById('pythonCode'), pythonCode);
document.getElementById('error').style.display = 'none';
})
.catch(error => {
document.getElementById('loading').style.display = 'none';
document.getElementById('error').style.display = 'block';
let errorMessage = 'An unexpected error occurred. Please try again later.';
if (error.status === 429) {
const details = error.details;
if (details.retry_after) {
errorMessage = `Rate limit exceeded. Please wait ${parseFloat(details.retry_after).toFixed(1)} seconds before trying again.`;
} else if (details.reset_at) {
const resetTime = new Date(details.reset_at);
const waitTime = Math.ceil((resetTime - new Date()) / 1000);
errorMessage = `Rate limit exceeded. Please try again after ${waitTime} seconds.`;
} else {
errorMessage = `Rate limit exceeded. Please try again later.`;
}
} else if (error.message) {
errorMessage = error.message;
}
document.querySelector('#error .terminal-alert').textContent = errorMessage;
});
});
</script>
</div>
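For reference, the request the demo's fetch() issues, as a Python sketch (endpoint and payload copied from the script above; the crawled URL is a placeholder):

```python
# Mirrors the demo's POST body; assumes the public endpoint accepts
# unauthenticated requests exactly as the JS above does.
import requests

payload = {
    "urls": ["https://example.com"],
    "bypass_cache": False,
    "word_count_threshold": 5,
    "screenshot": True,
}
resp = requests.post("https://crawl4ai.com/crawl", json=payload, timeout=60)
resp.raise_for_status()
result = resp.json()["results"][0]  # one URL in, one result out
print(result["markdown"][:200])
```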

(docs example; file path not shown)

@@ -99,7 +99,7 @@ async def main():
remove_overlay_elements=True,
# Cache control
cache_mode=CacheMode.ENABLED # Use cache if available
cache_mode=CacheMode.ENABLE # Use cache if available
)
if result.success:

main.py (158 lines changed)

@@ -380,97 +380,97 @@ def read_root():
return {"message": "Crawl4AI API service is running"}
@app.post("/crawl", dependencies=[Depends(verify_token)])
async def crawl(request: CrawlRequest) -> Dict[str, str]:
task_id = await crawler_service.submit_task(request)
return {"task_id": task_id}
# @app.post("/crawl", dependencies=[Depends(verify_token)])
# async def crawl(request: CrawlRequest) -> Dict[str, str]:
# task_id = await crawler_service.submit_task(request)
# return {"task_id": task_id}
@app.get("/task/{task_id}", dependencies=[Depends(verify_token)])
async def get_task_status(task_id: str):
task_info = crawler_service.task_manager.get_task(task_id)
if not task_info:
raise HTTPException(status_code=404, detail="Task not found")
# @app.get("/task/{task_id}", dependencies=[Depends(verify_token)])
# async def get_task_status(task_id: str):
# task_info = crawler_service.task_manager.get_task(task_id)
# if not task_info:
# raise HTTPException(status_code=404, detail="Task not found")
response = {
"status": task_info.status,
"created_at": task_info.created_at,
}
# response = {
# "status": task_info.status,
# "created_at": task_info.created_at,
# }
if task_info.status == TaskStatus.COMPLETED:
# Convert CrawlResult to dict for JSON response
if isinstance(task_info.result, list):
response["results"] = [result.dict() for result in task_info.result]
else:
response["result"] = task_info.result.dict()
elif task_info.status == TaskStatus.FAILED:
response["error"] = task_info.error
# if task_info.status == TaskStatus.COMPLETED:
# # Convert CrawlResult to dict for JSON response
# if isinstance(task_info.result, list):
# response["results"] = [result.dict() for result in task_info.result]
# else:
# response["result"] = task_info.result.dict()
# elif task_info.status == TaskStatus.FAILED:
# response["error"] = task_info.error
return response
# return response
@app.post("/crawl_sync", dependencies=[Depends(verify_token)])
async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
task_id = await crawler_service.submit_task(request)
# @app.post("/crawl_sync", dependencies=[Depends(verify_token)])
# async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
# task_id = await crawler_service.submit_task(request)
# Wait up to 60 seconds for task completion
for _ in range(60):
task_info = crawler_service.task_manager.get_task(task_id)
if not task_info:
raise HTTPException(status_code=404, detail="Task not found")
# # Wait up to 60 seconds for task completion
# for _ in range(60):
# task_info = crawler_service.task_manager.get_task(task_id)
# if not task_info:
# raise HTTPException(status_code=404, detail="Task not found")
if task_info.status == TaskStatus.COMPLETED:
# Return same format as /task/{task_id} endpoint
if isinstance(task_info.result, list):
return {"status": task_info.status, "results": [result.dict() for result in task_info.result]}
return {"status": task_info.status, "result": task_info.result.dict()}
# if task_info.status == TaskStatus.COMPLETED:
# # Return same format as /task/{task_id} endpoint
# if isinstance(task_info.result, list):
# return {"status": task_info.status, "results": [result.dict() for result in task_info.result]}
# return {"status": task_info.status, "result": task_info.result.dict()}
if task_info.status == TaskStatus.FAILED:
raise HTTPException(status_code=500, detail=task_info.error)
# if task_info.status == TaskStatus.FAILED:
# raise HTTPException(status_code=500, detail=task_info.error)
await asyncio.sleep(1)
# await asyncio.sleep(1)
# If we get here, task didn't complete within timeout
raise HTTPException(status_code=408, detail="Task timed out")
# # If we get here, task didn't complete within timeout
# raise HTTPException(status_code=408, detail="Task timed out")
@app.post("/crawl_direct", dependencies=[Depends(verify_token)])
async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]:
try:
crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params)
extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config)
# @app.post("/crawl_direct", dependencies=[Depends(verify_token)])
# async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]:
# try:
# crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params)
# extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config)
try:
if isinstance(request.urls, list):
results = await crawler.arun_many(
urls=[str(url) for url in request.urls],
extraction_strategy=extraction_strategy,
js_code=request.js_code,
wait_for=request.wait_for,
css_selector=request.css_selector,
screenshot=request.screenshot,
magic=request.magic,
cache_mode=request.cache_mode,
session_id=request.session_id,
**request.extra,
)
return {"results": [result.dict() for result in results]}
else:
result = await crawler.arun(
url=str(request.urls),
extraction_strategy=extraction_strategy,
js_code=request.js_code,
wait_for=request.wait_for,
css_selector=request.css_selector,
screenshot=request.screenshot,
magic=request.magic,
cache_mode=request.cache_mode,
session_id=request.session_id,
**request.extra,
)
return {"result": result.dict()}
finally:
await crawler_service.crawler_pool.release(crawler)
except Exception as e:
logger.error(f"Error in direct crawl: {str(e)}")
raise HTTPException(status_code=500, detail=str(e))
# try:
# if isinstance(request.urls, list):
# results = await crawler.arun_many(
# urls=[str(url) for url in request.urls],
# extraction_strategy=extraction_strategy,
# js_code=request.js_code,
# wait_for=request.wait_for,
# css_selector=request.css_selector,
# screenshot=request.screenshot,
# magic=request.magic,
# cache_mode=request.cache_mode,
# session_id=request.session_id,
# **request.extra,
# )
# return {"results": [result.dict() for result in results]}
# else:
# result = await crawler.arun(
# url=str(request.urls),
# extraction_strategy=extraction_strategy,
# js_code=request.js_code,
# wait_for=request.wait_for,
# css_selector=request.css_selector,
# screenshot=request.screenshot,
# magic=request.magic,
# cache_mode=request.cache_mode,
# session_id=request.session_id,
# **request.extra,
# )
# return {"result": result.dict()}
# finally:
# await crawler_service.crawler_pool.release(crawler)
# except Exception as e:
# logger.error(f"Error in direct crawl: {str(e)}")
# raise HTTPException(status_code=500, detail=str(e))
@app.get("/health")
async def health_check():
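For context on what was disabled: the commented-out endpoints implemented a task-based flow in which POST /crawl returned a task_id and GET /task/{task_id} reported status and, once completed, results. A hedged client sketch of that flow (host, port, token, and the serialized status strings are assumptions):

```python
# Sketch of the pre-removal task API, reconstructed from the commented-out
# handlers above; endpoint shapes come from that code, everything else
# (base URL, token, status values) is assumed.
import time
import requests

BASE = "http://localhost:11235"                    # assumed self-hosted server
HEADERS = {"Authorization": "Bearer YOUR_TOKEN"}   # for the verify_token guard

task_id = requests.post(
    f"{BASE}/crawl", json={"urls": "https://example.com"}, headers=HEADERS
).json()["task_id"]

for _ in range(60):                                # mirror the 60 s wait loop
    info = requests.get(f"{BASE}/task/{task_id}", headers=HEADERS).json()
    if info["status"] in ("completed", "failed"):
        break
    time.sleep(1)

print(info["status"], info.get("error"))
```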

mkdocs.yml

@@ -8,7 +8,7 @@ docs_dir: docs/md_v2
nav:
- Home: 'index.md'
- 'Installation': 'basic/installation.md'
- 'Docker Deplotment': 'basic/docker-deploymeny.md'
- 'Docker Deployment': 'basic/docker-deploymeny.md'
- 'Quick Start': 'basic/quickstart.md'
- Changelog & Blog:
- 'Blog Home': 'blog/index.md'

setup.py

@@ -57,9 +57,6 @@ setup(
author_email="unclecode@kidocode.com",
license="MIT",
packages=find_packages(),
package_data={
'crawl4ai': ['js_snippet/*.js'] # This matches the exact path structure
},
install_requires=default_requirements
+ ["playwright", "aiofiles"], # Added aiofiles
extras_require={