Compare commits

..

5 Commits

Author SHA1 Message Date
UncleCode
d97a075082 Delete a.md 2024-12-25 19:43:39 +08:00
Haopeng138
bacbeb3ed4 Fix #340 example llm_extraction (#358)
@Haopeng138 Thank you so much. They are still part of the library. I forgot to update them when I moved to the asynchronous versions a while ago. I really appreciate it. I have to say that I feel the documentation is a weak point, which is why I spent a lot of time on it last week. When you mentioned the example folder, I realized I had forgotten to update it as well. I'll try to keep it more up to date. If you find anything else, please help and support. Thank you. I will add your name to the contributors list as well.
2024-12-24 19:56:07 +08:00
UncleCode
ed7bc1909c Bump version to 0.4.22 2024-12-15 19:49:38 +08:00
UncleCode
e9e5b5642d Fix js_snippet issue 0.4.21
bump to 0.4.22
2024-12-15 19:49:30 +08:00
UncleCode
7524aa7b5e Feature: Add Markdown generation to CrawlerRunConfig
- Added markdown generator parameter to CrawlerRunConfig in `async_configs.py`.
  - Implemented logic for Markdown generation in content scraping in `async_webcrawler.py`.
  - Updated version number to 0.4.21 in `__version__.py`.
2024-12-13 21:51:38 +08:00
13 changed files with 165 additions and 4588 deletions

View File

@@ -1 +1,2 @@
include requirements.txt include requirements.txt
recursive-include crawl4ai/js_snippet *.js

4214
a.md

File diff suppressed because it is too large Load Diff

View File

@@ -1,2 +1,2 @@
# crawl4ai/_version.py # crawl4ai/_version.py
__version__ = "0.4.2" __version__ = "0.4.22"

View File

@@ -7,6 +7,7 @@ from .config import (
from .user_agent_generator import UserAgentGenerator from .user_agent_generator import UserAgentGenerator
from .extraction_strategy import ExtractionStrategy from .extraction_strategy import ExtractionStrategy
from .chunking_strategy import ChunkingStrategy from .chunking_strategy import ChunkingStrategy
from .markdown_generation_strategy import MarkdownGenerationStrategy
class BrowserConfig: class BrowserConfig:
""" """
@@ -269,6 +270,7 @@ class CrawlerRunConfig:
word_count_threshold: int = MIN_WORD_THRESHOLD , word_count_threshold: int = MIN_WORD_THRESHOLD ,
extraction_strategy : ExtractionStrategy=None, # Will default to NoExtractionStrategy if None extraction_strategy : ExtractionStrategy=None, # Will default to NoExtractionStrategy if None
chunking_strategy : ChunkingStrategy= None, # Will default to RegexChunking if None chunking_strategy : ChunkingStrategy= None, # Will default to RegexChunking if None
markdown_generator : MarkdownGenerationStrategy = None,
content_filter=None, content_filter=None,
cache_mode=None, cache_mode=None,
session_id: str = None, session_id: str = None,
@@ -309,6 +311,7 @@ class CrawlerRunConfig:
self.word_count_threshold = word_count_threshold self.word_count_threshold = word_count_threshold
self.extraction_strategy = extraction_strategy self.extraction_strategy = extraction_strategy
self.chunking_strategy = chunking_strategy self.chunking_strategy = chunking_strategy
self.markdown_generator = markdown_generator
self.content_filter = content_filter self.content_filter = content_filter
self.cache_mode = cache_mode self.cache_mode = cache_mode
self.session_id = session_id self.session_id = session_id
@@ -364,6 +367,7 @@ class CrawlerRunConfig:
word_count_threshold=kwargs.get("word_count_threshold", 200), word_count_threshold=kwargs.get("word_count_threshold", 200),
extraction_strategy=kwargs.get("extraction_strategy"), extraction_strategy=kwargs.get("extraction_strategy"),
chunking_strategy=kwargs.get("chunking_strategy"), chunking_strategy=kwargs.get("chunking_strategy"),
markdown_generator=kwargs.get("markdown_generator"),
content_filter=kwargs.get("content_filter"), content_filter=kwargs.get("content_filter"),
cache_mode=kwargs.get("cache_mode"), cache_mode=kwargs.get("cache_mode"),
session_id=kwargs.get("session_id"), session_id=kwargs.get("session_id"),

View File

@@ -7,7 +7,8 @@ from pathlib import Path
from typing import Optional, List, Union from typing import Optional, List, Union
import json import json
import asyncio import asyncio
from contextlib import nullcontext, asynccontextmanager # from contextlib import nullcontext, asynccontextmanager
from contextlib import asynccontextmanager
from .models import CrawlResult, MarkdownGenerationResult from .models import CrawlResult, MarkdownGenerationResult
from .async_database import async_db_manager from .async_database import async_db_manager
from .chunking_strategy import * from .chunking_strategy import *
@@ -15,6 +16,7 @@ from .content_filter_strategy import *
from .extraction_strategy import * from .extraction_strategy import *
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
from .content_scraping_strategy import WebScrapingStrategy from .content_scraping_strategy import WebScrapingStrategy
from .async_logger import AsyncLogger from .async_logger import AsyncLogger
from .async_configs import BrowserConfig, CrawlerRunConfig from .async_configs import BrowserConfig, CrawlerRunConfig
@@ -133,16 +135,11 @@ class AsyncWebCrawler:
async def __aexit__(self, exc_type, exc_val, exc_tb): async def __aexit__(self, exc_type, exc_val, exc_tb):
await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb) await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb)
@asynccontextmanager
async def nullcontext(self):
yield
async def awarmup(self): async def awarmup(self):
"""Initialize the crawler with warm-up sequence.""" """Initialize the crawler with warm-up sequence."""
self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT") self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
self.ready = True self.ready = True
@asynccontextmanager @asynccontextmanager
async def nullcontext(self): async def nullcontext(self):
"""异步空上下文管理器""" """异步空上下文管理器"""
@@ -323,7 +320,8 @@ class AsyncWebCrawler:
config=config, # Pass the config object instead of individual parameters config=config, # Pass the config object instead of individual parameters
screenshot=screenshot_data, screenshot=screenshot_data,
pdf_data=pdf_data, pdf_data=pdf_data,
verbose=config.verbose verbose=config.verbose,
**kwargs
) )
# Set response data # Set response data
@@ -424,7 +422,8 @@ class AsyncWebCrawler:
css_selector=config.css_selector, css_selector=config.css_selector,
only_text=config.only_text, only_text=config.only_text,
image_description_min_word_threshold=config.image_description_min_word_threshold, image_description_min_word_threshold=config.image_description_min_word_threshold,
content_filter=config.content_filter content_filter=config.content_filter,
**kwargs
) )
if result is None: if result is None:
@@ -435,16 +434,29 @@ class AsyncWebCrawler:
except Exception as e: except Exception as e:
raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}") raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")
# Extract results # Extract results
markdown_v2 = result.get("markdown_v2", None)
cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
markdown = sanitize_input_encode(result.get("markdown", ""))
fit_markdown = sanitize_input_encode(result.get("fit_markdown", "")) fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
fit_html = sanitize_input_encode(result.get("fit_html", "")) fit_html = sanitize_input_encode(result.get("fit_html", ""))
media = result.get("media", []) media = result.get("media", [])
links = result.get("links", []) links = result.get("links", [])
metadata = result.get("metadata", {}) metadata = result.get("metadata", {})
# Markdown Generation
markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
if not config.content_filter and not markdown_generator.content_filter:
markdown_generator.content_filter = PruningContentFilter()
markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
cleaned_html=cleaned_html,
base_url=url,
# html2text_options=kwargs.get('html2text', {})
)
markdown_v2 = markdown_result
markdown = sanitize_input_encode(markdown_result.raw_markdown)
# Log processing completion # Log processing completion
self.logger.info( self.logger.info(
message="Processed {url:.50}... | Time: {timing}ms", message="Processed {url:.50}... | Time: {timing}ms",

View File

@@ -602,16 +602,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ') cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ')
markdown_content = self._generate_markdown_content( # markdown_content = self._generate_markdown_content(
cleaned_html=cleaned_html, # cleaned_html=cleaned_html,
html=html, # html=html,
url=url, # url=url,
success=success, # success=success,
**kwargs # **kwargs
) # )
return { return {
**markdown_content, # **markdown_content,
'cleaned_html': cleaned_html, 'cleaned_html': cleaned_html,
'success': success, 'success': success,
'media': media, 'media': media,

View File

@@ -1,23 +1,21 @@
import os
import time
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import * from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import * from crawl4ai.crawler_strategy import *
import asyncio
from pydantic import BaseModel, Field
url = r'https://openai.com/api/pricing/' url = r'https://openai.com/api/pricing/'
crawler = WebCrawler()
crawler.warmup()
from pydantic import BaseModel, Field
class OpenAIModelFee(BaseModel): class OpenAIModelFee(BaseModel):
model_name: str = Field(..., description="Name of the OpenAI model.") model_name: str = Field(..., description="Name of the OpenAI model.")
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
output_fee: str = Field(..., description="Fee for output token for the OpenAI model.") output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
result = crawler.run( from crawl4ai import AsyncWebCrawler
async def main():
# Use AsyncWebCrawler
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url=url, url=url,
word_count_threshold=1, word_count_threshold=1,
extraction_strategy= LLMExtractionStrategy( extraction_strategy= LLMExtractionStrategy(
@@ -30,12 +28,13 @@ result = crawler.run(
'One extracted model JSON format should look like this: ' \ 'One extracted model JSON format should look like this: ' \
'{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }' '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
), ),
bypass_cache=True,
) )
print("Success:", result.success)
model_fees = json.loads(result.extracted_content) model_fees = json.loads(result.extracted_content)
print(len(model_fees)) print(len(model_fees))
with open(".data/data.json", "w", encoding="utf-8") as f: with open(".data/data.json", "w", encoding="utf-8") as f:
f.write(result.extracted_content) f.write(result.extracted_content)
asyncio.run(main())

View File

@@ -142,6 +142,7 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None
crawler_config = CrawlerRunConfig( crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS, cache_mode=CacheMode.BYPASS,
word_count_threshold=1, word_count_threshold=1,
page_timeout = 80000,
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
provider=provider, provider=provider,
api_token=api_token, api_token=api_token,
@@ -497,21 +498,21 @@ async def main():
# Advanced examples # Advanced examples
# await extract_structured_data_using_css_extractor() # await extract_structured_data_using_css_extractor()
# await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
# await crawl_dynamic_content_pages_method_1() # await crawl_dynamic_content_pages_method_1()
# await crawl_dynamic_content_pages_method_2() # await crawl_dynamic_content_pages_method_2()
# Browser comparisons # Browser comparisons
await crawl_custom_browser_type() # await crawl_custom_browser_type()
# Performance testing # Performance testing
# await speed_comparison() # await speed_comparison()
# Screenshot example # Screenshot example
await capture_and_save_screenshot( # await capture_and_save_screenshot(
"https://www.example.com", # "https://www.example.com",
os.path.join(__location__, "tmp/example_screenshot.jpg") # os.path.join(__location__, "tmp/example_screenshot.jpg")
) # )
if __name__ == "__main__": if __name__ == "__main__":
asyncio.run(main()) asyncio.run(main())

View File

@@ -239,8 +239,10 @@ async def crawl_dynamic_content_pages_method_1():
all_commits = [] all_commits = []
js_next_page = """ js_next_page = """
(() => {
const button = document.querySelector('a[data-testid="pagination-next-button"]'); const button = document.querySelector('a[data-testid="pagination-next-button"]');
if (button) button.click(); if (button) button.click();
})();
""" """
for page in range(3): # Crawl 3 pages for page in range(3): # Crawl 3 pages
@@ -604,14 +606,14 @@ async def fit_markdown_remove_overlay():
async def main(): async def main():
await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
await simple_crawl() # await simple_crawl()
await simple_example_with_running_js_code() # await simple_example_with_running_js_code()
await simple_example_with_css_selector() # await simple_example_with_css_selector()
# await use_proxy() # # await use_proxy()
await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
await extract_structured_data_using_css_extractor() # await extract_structured_data_using_css_extractor()
# LLM extraction examples # LLM extraction examples
# await extract_structured_data_using_llm() # await extract_structured_data_using_llm()

View File

@@ -1,231 +0,0 @@
# Interactive Demo for Crawler
<div id="demo">
<form id="crawlForm" class="terminal-form">
<fieldset>
<legend>Enter URL and Options</legend>
<div class="form-group">
<label for="url">Enter URL:</label>
<input type="text" id="url" name="url" required>
</div>
<div class="form-group">
<label for="screenshot">Get Screenshot:</label>
<input type="checkbox" id="screenshot" name="screenshot">
</div>
<div class="form-group">
<button class="btn btn-default" type="submit">Submit</button>
</div>
</fieldset>
</form>
<div id="loading" class="loading-message">
<div class="terminal-alert terminal-alert-primary">Loading... Please wait.</div>
</div>
<section id="response" class="response-section">
<h2>Response</h2>
<div class="tabs">
<ul class="tab-list">
<li class="tab-item" onclick="showTab('markdown')">Markdown</li>
<li class="tab-item" onclick="showTab('cleanedHtml')">Cleaned HTML</li>
<li class="tab-item" onclick="showTab('media')">Media</li>
<li class="tab-item" onclick="showTab('extractedContent')">Extracted Content</li>
<li class="tab-item" onclick="showTab('screenshot')">Screenshot</li>
<li class="tab-item" onclick="showTab('pythonCode')">Python Code</li>
</ul>
<div class="tab-content" id="tab-markdown">
<header>
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('markdownContent')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('markdownContent', 'markdown.md')">Download</button>
</div>
</header>
<pre><code id="markdownContent" class="language-markdown hljs"></code></pre>
</div>
<div class="tab-content" id="tab-cleanedHtml" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('cleanedHtmlContent')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('cleanedHtmlContent', 'cleaned.html')">Download</button>
</div>
</header>
<pre><code id="cleanedHtmlContent" class="language-html hljs"></code></pre>
</div>
<div class="tab-content" id="tab-media" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('mediaContent')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('mediaContent', 'media.json')">Download</button>
</div>
</header>
<pre><code id="mediaContent" class="language-json hljs"></code></pre>
</div>
<div class="tab-content" id="tab-extractedContent" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('extractedContentContent')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('extractedContentContent', 'extracted_content.json')">Download</button>
</div>
</header>
<pre><code id="extractedContentContent" class="language-json hljs"></code></pre>
</div>
<div class="tab-content" id="tab-screenshot" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadImage('screenshotContent', 'screenshot.png')">Download</button>
</div>
</header>
<pre><img id="screenshotContent" /></pre>
</div>
<div class="tab-content" id="tab-pythonCode" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('pythonCode')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('pythonCode', 'example.py')">Download</button>
</div>
</header>
<pre><code id="pythonCode" class="language-python hljs"></code></pre>
</div>
</div>
</section>
<div id="error" class="error-message" style="display: none; margin-top:1em;">
<div class="terminal-alert terminal-alert-error"></div>
</div>
<script>
function showTab(tabId) {
const tabs = document.querySelectorAll('.tab-content');
tabs.forEach(tab => tab.style.display = 'none');
document.getElementById(`tab-${tabId}`).style.display = 'block';
}
function redo(codeBlock, codeText){
codeBlock.classList.remove('hljs');
codeBlock.removeAttribute('data-highlighted');
// Set new code and re-highlight
codeBlock.textContent = codeText;
hljs.highlightBlock(codeBlock);
}
function copyToClipboard(elementId) {
const content = document.getElementById(elementId).textContent;
navigator.clipboard.writeText(content).then(() => {
alert('Copied to clipboard');
});
}
function downloadContent(elementId, filename) {
const content = document.getElementById(elementId).textContent;
const blob = new Blob([content], { type: 'text/plain' });
const url = window.URL.createObjectURL(blob);
const a = document.createElement('a');
a.style.display = 'none';
a.href = url;
a.download = filename;
document.body.appendChild(a);
a.click();
window.URL.revokeObjectURL(url);
document.body.removeChild(a);
}
function downloadImage(elementId, filename) {
const content = document.getElementById(elementId).src;
const a = document.createElement('a');
a.style.display = 'none';
a.href = content;
a.download = filename;
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
}
document.getElementById('crawlForm').addEventListener('submit', function(event) {
event.preventDefault();
document.getElementById('loading').style.display = 'block';
document.getElementById('response').style.display = 'none';
const url = document.getElementById('url').value;
const screenshot = document.getElementById('screenshot').checked;
const data = {
urls: [url],
bypass_cache: false,
word_count_threshold: 5,
screenshot: screenshot
};
fetch('https://crawl4ai.com/crawl', {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify(data)
})
.then(response => {
if (!response.ok) {
if (response.status === 429) {
return response.json().then(err => {
throw Object.assign(new Error('Rate limit exceeded'), { status: 429, details: err });
});
}
throw new Error('Network response was not ok');
}
return response.json();
})
.then(data => {
data = data.results[0]; // Only one URL is requested
document.getElementById('loading').style.display = 'none';
document.getElementById('response').style.display = 'block';
redo(document.getElementById('markdownContent'), data.markdown);
redo(document.getElementById('cleanedHtmlContent'), data.cleaned_html);
redo(document.getElementById('mediaContent'), JSON.stringify(data.media, null, 2));
redo(document.getElementById('extractedContentContent'), data.extracted_content);
if (screenshot) {
document.getElementById('screenshotContent').src = `data:image/png;base64,${data.screenshot}`;
}
const pythonCode = `
from crawl4ai.web_crawler import WebCrawler
crawler = WebCrawler()
crawler.warmup()
result = crawler.run(
url='${url}',
screenshot=${screenshot}
)
print(result)
`;
redo(document.getElementById('pythonCode'), pythonCode);
document.getElementById('error').style.display = 'none';
})
.catch(error => {
document.getElementById('loading').style.display = 'none';
document.getElementById('error').style.display = 'block';
let errorMessage = 'An unexpected error occurred. Please try again later.';
if (error.status === 429) {
const details = error.details;
if (details.retry_after) {
errorMessage = `Rate limit exceeded. Please wait ${parseFloat(details.retry_after).toFixed(1)} seconds before trying again.`;
} else if (details.reset_at) {
const resetTime = new Date(details.reset_at);
const waitTime = Math.ceil((resetTime - new Date()) / 1000);
errorMessage = `Rate limit exceeded. Please try again after ${waitTime} seconds.`;
} else {
errorMessage = `Rate limit exceeded. Please try again later.`;
}
} else if (error.message) {
errorMessage = error.message;
}
document.querySelector('#error .terminal-alert').textContent = errorMessage;
});
});
</script>
</div>

158
main.py
View File

@@ -380,97 +380,97 @@ def read_root():
return {"message": "Crawl4AI API service is running"} return {"message": "Crawl4AI API service is running"}
# @app.post("/crawl", dependencies=[Depends(verify_token)]) @app.post("/crawl", dependencies=[Depends(verify_token)])
# async def crawl(request: CrawlRequest) -> Dict[str, str]: async def crawl(request: CrawlRequest) -> Dict[str, str]:
# task_id = await crawler_service.submit_task(request) task_id = await crawler_service.submit_task(request)
# return {"task_id": task_id} return {"task_id": task_id}
# @app.get("/task/{task_id}", dependencies=[Depends(verify_token)]) @app.get("/task/{task_id}", dependencies=[Depends(verify_token)])
# async def get_task_status(task_id: str): async def get_task_status(task_id: str):
# task_info = crawler_service.task_manager.get_task(task_id) task_info = crawler_service.task_manager.get_task(task_id)
# if not task_info: if not task_info:
# raise HTTPException(status_code=404, detail="Task not found") raise HTTPException(status_code=404, detail="Task not found")
# response = { response = {
# "status": task_info.status, "status": task_info.status,
# "created_at": task_info.created_at, "created_at": task_info.created_at,
# } }
# if task_info.status == TaskStatus.COMPLETED: if task_info.status == TaskStatus.COMPLETED:
# # Convert CrawlResult to dict for JSON response # Convert CrawlResult to dict for JSON response
# if isinstance(task_info.result, list): if isinstance(task_info.result, list):
# response["results"] = [result.dict() for result in task_info.result] response["results"] = [result.dict() for result in task_info.result]
# else: else:
# response["result"] = task_info.result.dict() response["result"] = task_info.result.dict()
# elif task_info.status == TaskStatus.FAILED: elif task_info.status == TaskStatus.FAILED:
# response["error"] = task_info.error response["error"] = task_info.error
# return response return response
# @app.post("/crawl_sync", dependencies=[Depends(verify_token)]) @app.post("/crawl_sync", dependencies=[Depends(verify_token)])
# async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]: async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
# task_id = await crawler_service.submit_task(request) task_id = await crawler_service.submit_task(request)
# # Wait up to 60 seconds for task completion # Wait up to 60 seconds for task completion
# for _ in range(60): for _ in range(60):
# task_info = crawler_service.task_manager.get_task(task_id) task_info = crawler_service.task_manager.get_task(task_id)
# if not task_info: if not task_info:
# raise HTTPException(status_code=404, detail="Task not found") raise HTTPException(status_code=404, detail="Task not found")
# if task_info.status == TaskStatus.COMPLETED: if task_info.status == TaskStatus.COMPLETED:
# # Return same format as /task/{task_id} endpoint # Return same format as /task/{task_id} endpoint
# if isinstance(task_info.result, list): if isinstance(task_info.result, list):
# return {"status": task_info.status, "results": [result.dict() for result in task_info.result]} return {"status": task_info.status, "results": [result.dict() for result in task_info.result]}
# return {"status": task_info.status, "result": task_info.result.dict()} return {"status": task_info.status, "result": task_info.result.dict()}
# if task_info.status == TaskStatus.FAILED: if task_info.status == TaskStatus.FAILED:
# raise HTTPException(status_code=500, detail=task_info.error) raise HTTPException(status_code=500, detail=task_info.error)
# await asyncio.sleep(1) await asyncio.sleep(1)
# # If we get here, task didn't complete within timeout # If we get here, task didn't complete within timeout
# raise HTTPException(status_code=408, detail="Task timed out") raise HTTPException(status_code=408, detail="Task timed out")
# @app.post("/crawl_direct", dependencies=[Depends(verify_token)]) @app.post("/crawl_direct", dependencies=[Depends(verify_token)])
# async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]: async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]:
# try: try:
# crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params) crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params)
# extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config) extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config)
# try: try:
# if isinstance(request.urls, list): if isinstance(request.urls, list):
# results = await crawler.arun_many( results = await crawler.arun_many(
# urls=[str(url) for url in request.urls], urls=[str(url) for url in request.urls],
# extraction_strategy=extraction_strategy, extraction_strategy=extraction_strategy,
# js_code=request.js_code, js_code=request.js_code,
# wait_for=request.wait_for, wait_for=request.wait_for,
# css_selector=request.css_selector, css_selector=request.css_selector,
# screenshot=request.screenshot, screenshot=request.screenshot,
# magic=request.magic, magic=request.magic,
# cache_mode=request.cache_mode, cache_mode=request.cache_mode,
# session_id=request.session_id, session_id=request.session_id,
# **request.extra, **request.extra,
# ) )
# return {"results": [result.dict() for result in results]} return {"results": [result.dict() for result in results]}
# else: else:
# result = await crawler.arun( result = await crawler.arun(
# url=str(request.urls), url=str(request.urls),
# extraction_strategy=extraction_strategy, extraction_strategy=extraction_strategy,
# js_code=request.js_code, js_code=request.js_code,
# wait_for=request.wait_for, wait_for=request.wait_for,
# css_selector=request.css_selector, css_selector=request.css_selector,
# screenshot=request.screenshot, screenshot=request.screenshot,
# magic=request.magic, magic=request.magic,
# cache_mode=request.cache_mode, cache_mode=request.cache_mode,
# session_id=request.session_id, session_id=request.session_id,
# **request.extra, **request.extra,
# ) )
# return {"result": result.dict()} return {"result": result.dict()}
# finally: finally:
# await crawler_service.crawler_pool.release(crawler) await crawler_service.crawler_pool.release(crawler)
# except Exception as e: except Exception as e:
# logger.error(f"Error in direct crawl: {str(e)}") logger.error(f"Error in direct crawl: {str(e)}")
# raise HTTPException(status_code=500, detail=str(e)) raise HTTPException(status_code=500, detail=str(e))
@app.get("/health") @app.get("/health")
async def health_check(): async def health_check():

View File

@@ -8,7 +8,7 @@ docs_dir: docs/md_v2
nav: nav:
- Home: 'index.md' - Home: 'index.md'
- 'Installation': 'basic/installation.md' - 'Installation': 'basic/installation.md'
- 'Docker Deployment': 'basic/docker-deploymeny.md' - 'Docker Deplotment': 'basic/docker-deploymeny.md'
- 'Quick Start': 'basic/quickstart.md' - 'Quick Start': 'basic/quickstart.md'
- Changelog & Blog: - Changelog & Blog:
- 'Blog Home': 'blog/index.md' - 'Blog Home': 'blog/index.md'

View File

@@ -57,6 +57,9 @@ setup(
author_email="unclecode@kidocode.com", author_email="unclecode@kidocode.com",
license="MIT", license="MIT",
packages=find_packages(), packages=find_packages(),
package_data={
'crawl4ai': ['js_snippet/*.js'] # This matches the exact path structure
},
install_requires=default_requirements install_requires=default_requirements
+ ["playwright", "aiofiles"], # Added aiofiles + ["playwright", "aiofiles"], # Added aiofiles
extras_require={ extras_require={